In [2]:
%pip install -r requirements.txt

Collecting baukit@ git+https://github.com/davidbau/baukit@e14a18a6ad6cf9e0d6a5dc7a97e671e393c01682 (from -r requirements.txt (line 11))
  Using cached baukit-0.0.1-py3-none-any.whl
Collecting neuron-explainer@ git+https://github.com/openai/automated-interpretability.git@8be455788f43a603381e3c1b38a697ad4797a90f#subdirectory=neuron-explainer (from -r requirements.txt (line 85))
  Using cached neuron_explainer-0.0.1-py3-none-any.whl
Collecting transformer-lens@ git+https://github.com/neelnanda-io/TransformerLens@ae32fa54ad40cb2c3f3a60f1837d0b4899c8daae (from -r requirements.txt (line 164))
  Using cached transformer_lens-0.0.0-py3-none-any.whl
Collecting aiohttp==3.8.5 (from -r requirements.txt (line 1))
  Using cached aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiosignal==1.3.1 (from -r requirements.txt (line 2))
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting anyio==3.7.0 (from -r requirements.txt (line 3))
  

In [26]:
import torch 
import argparse
from utils import dotdict
from activation_dataset import setup_token_data
import wandb
import json
from datetime import datetime
from tqdm import tqdm
from einops import rearrange
import matplotlib.pyplot as plt

cfg = dotdict()
# models: "EleutherAI/pythia-6.9b", "lomahony/eleuther-pythia6.9b-hh-sft", "usvsnsp/pythia-6.9b-ppo", "Dahoas/gptj-rm-static", "reciprocate/dahoas-gptj-rm-static"
# cfg.model_name="lomahony/eleuther-pythia6.9b-hh-sft"
# "EleutherAI/pythia-70m", "lomahony/pythia-70m-helpful-sft", "lomahony/eleuther-pythia70m-hh-sft"
cfg.model_name="EleutherAI/pythia-70m-deduped"
cfg.layers=[0,]
cfg.setting="residual"
# cfg.tensor_name="gpt_neox.layers.{layer}" or "transformer.h.{layer}"
cfg.tensor_name="gpt_neox.layers.{layer}"
original_l1_alpha = 8e-4
cfg.l1_alpha=original_l1_alpha
cfg.l1_alphas=[8e-5, 1e-4, 2e-4, 4e-4, 8e-4, 1e-3, 2e-3, 4e-3, 8e-3]
cfg.sparsity=None
cfg.num_epochs=10
cfg.model_batch_size=8
cfg.lr=1e-3
cfg.kl=False
cfg.reconstruction=False
#cfg.dataset_name="NeelNanda/pile-10k"
cfg.dataset_name="Elriggs/openwebtext-100k"
cfg.device="cuda:0"
cfg.ratio = 4
cfg.seed = 0
# cfg.device="cpu"

In [27]:
tensor_names = [cfg.tensor_name.format(layer=layer) for layer in cfg.layers]

In [28]:
# Load in the model
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, GPTJForSequenceClassification
model = AutoModelForCausalLM.from_pretrained(cfg.model_name)
model = model.to(cfg.device)
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

Downloading config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [29]:
# Download the dataset
# TODO iteratively grab dataset?
cfg.max_length = 256
token_loader = setup_token_data(cfg, tokenizer, model, seed=cfg.seed)
num_tokens = cfg.max_length*cfg.model_batch_size*len(token_loader)
print(f"Number of tokens: {num_tokens}")

Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Map (num_proc=8):   0%|          | 0/100000 [00:00<?, ? examples/s]

Number of tokens: 112750592


In [30]:
# Run 1 datapoint on model to get the activation size
from baukit import Trace

text = "1"
tokens = tokenizer(text, return_tensors="pt").input_ids.to(cfg.device)
# Your activation name will be different. In the next cells, we will show you how to find it.
with torch.no_grad():
    with Trace(model, tensor_names[0]) as ret:
        _ = model(tokens)
        representation = ret.output
        # check if instance tuple
        if(isinstance(representation, tuple)):
            representation = representation[0]
        activation_size = representation.shape[-1]
print(f"Activation size: {activation_size}")

Activation size: 512


In [6]:
%pip install torch

[0mNote: you may need to restart the kernel to use updated packages.


In [14]:
%pip install --upgrade functorch

Collecting functorch
  Downloading functorch-2.0.0-py2.py3-none-any.whl (2.1 kB)
Installing collected packages: functorch
Successfully installed functorch-2.0.0
[0mNote: you may need to restart the kernel to use updated packages.


In [31]:
# Initialize New autoencoder
from autoencoders.learned_dict import AnthropicSAE
from torch import nn
autoencoders = []
optimizers = []
for layer in cfg.layers:
    l1_variants = []
    l1_optimizers = []
    for l1 in cfg.l1_alphas:
        params = dict()
        n_dict_components = activation_size*cfg.ratio
        params["encoder"] = torch.empty((n_dict_components, activation_size), device=cfg.device)
        nn.init.xavier_uniform_(params["encoder"])

        params["decoder"] = torch.empty((n_dict_components, activation_size), device=cfg.device)
        nn.init.xavier_uniform_(params["decoder"])

        params["encoder_bias"] = torch.empty((n_dict_components,), device=cfg.device)
        nn.init.zeros_(params["encoder_bias"])

        params["shift_bias"] = torch.empty((activation_size,), device=cfg.device)
        nn.init.zeros_(params["shift_bias"])

        autoencoder = AnthropicSAE(  # TiedSAE, UntiedSAE, AnthropicSAE
            # n_feats = n_dict_components, 
            # activation_size=activation_size,
            encoder=params["encoder"],
            encoder_bias=params["encoder_bias"],
            decoder=params["decoder"],
            shift_bias=params["shift_bias"],
        )
        autoencoder.to_device(cfg.device)
        autoencoder.set_grad()
        l1_variants.append(autoencoder)

        optimizer = torch.optim.Adam(
            [
                autoencoder.encoder, 
                autoencoder.encoder_bias,
                autoencoder.decoder,
                autoencoder.shift_bias,
            ], lr=cfg.lr)
        l1_optimizers.append(optimizer)
        
    autoencoders.append(l1_variants)
    optimizers.append(l1_optimizers)

In [32]:
# Set target sparsity to 10% of activation_size if not set
if cfg.sparsity is None:
    cfg.sparsity = int(activation_size*0.05)
    print(f"Target sparsity: {cfg.sparsity}")

target_lower_sparsity = cfg.sparsity * 0.9
target_upper_sparsity = cfg.sparsity * 1.1
adjustment_factor = 0.1  # You can set this to whatever you like

Target sparsity: 25


In [33]:
original_bias = autoencoder.encoder_bias.clone().detach()
# Wandb setup
secrets = json.load(open("secrets.json"))
wandb.login(key=secrets["wandb_key"])
start_time = datetime.now().strftime("%Y%m%d-%H%M%S")
wandb_run_name = f"{cfg.model_name}_{start_time[4:]}_{cfg.sparsity}"  # trim year
print(f"wandb_run_name: {wandb_run_name}")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


wandb_run_name: EleutherAI/pythia-70m-deduped_1210-055534_25


In [34]:
wandb.init(project="sparse coding", config=dict(cfg), name=wandb_run_name)

In [None]:
# time_since_activation = torch.zeros(autoencoder.encoder.shape[0])
total_activations = torch.zeros(autoencoder.encoder.shape[0])
max_num_tokens = 100_000_000
save_every = 30_000
num_saved_so_far = 0
# Freeze model parameters 
model.eval()
model.requires_grad_(False)
model.to(cfg.device)
# last_encoder = autoencoder.encoder.clone().detach()
for i, batch in enumerate(tqdm(token_loader,total=int(max_num_tokens/(cfg.max_length*cfg.model_batch_size)))):
    tokens = batch["input_ids"].to(cfg.device)
    with torch.no_grad(): # As long as not doing KL divergence, don't need gradients for model
        
        #print(tensor_names)
        
        with Trace(model, tensor_names[0]) as ret:
            _ = model(tokens)
            representation = ret.output
            if(isinstance(representation, tuple)):
                representation = representation[0]
    #print(f"representation is: {representation}")
    print(f"representation shape is: {representation.shape}")
    layer_activations = rearrange(representation, "b seq d_model -> (b seq) d_model")
    # activation_saver.save_batch(layer_activations.clone().cpu().detach())
    for layer in range(len(cfg.layers)):
        for l1_alpha in range(len(cfg.l1_alphas)):
            autoencoder = autoencoders[layer][l1_alpha]
            optimizer = optimizers[layer][l1_alpha]
            
            c = autoencoder.encode(layer_activations)
            x_hat = autoencoder.decode(c)
            
            reconstruction_loss = (x_hat - layer_activations).pow(2).mean()
            l1_loss = torch.norm(c, 1, dim=-1).mean()
            total_loss = reconstruction_loss + cfg.l1_alpha*l1_loss

            time_since_activation += 1
            time_since_activation = time_since_activation * (c.sum(dim=0).cpu()==0)
            # total_activations += c.sum(dim=0).cpu()
            if ((i) % 100 == 0): # Check here so first check is model w/o change
                # self_similarity = torch.cosine_similarity(c, last_encoder, dim=-1).mean().cpu().item()
                # Above is wrong, should be similarity between encoder and last encoder
                self_similarity = torch.cosine_similarity(autoencoder.encoder, last_encoder, dim=-1).mean().cpu().item()
                last_encoder = autoencoder.encoder.clone().detach()

                num_tokens_so_far = i*cfg.max_length*cfg.model_batch_size
                with torch.no_grad():
                    sparsity = (c != 0).float().mean(dim=0).sum().cpu().item()
                    # Count number of dead_features are zero
                    num_dead_features = (time_since_activation >= min(i, 200)).sum().item()
                print(f"Sparsity: {sparsity:.1f} | Dead Features: {num_dead_features} | Total Loss: {total_loss:.2f} | Reconstruction Loss: {reconstruction_loss:.2f} | L1 Loss: {cfg.l1_alpha*l1_loss:.2f} | l1_alpha: {cfg.l1_alpha:.2e} | Tokens: {num_tokens_so_far} | Self Similarity: {self_similarity:.2f}")
                wandb.log({
                    'Sparsity': sparsity,
                    'Dead Features': num_dead_features,
                    'Total Loss': total_loss.item(),
                    'Reconstruction Loss': reconstruction_loss.item(),
                    'L1 Loss': (cfg.l1_alpha*l1_loss).item(),
                    'l1_alpha': cfg.l1_alpha,
                    'Tokens': num_tokens_so_far,
                    'Self Similarity': self_similarity
                })
                
                dead_features = torch.zeros(autoencoder.encoder.shape[0])
                
                if(num_tokens_so_far > max_num_tokens):
                    print(f"Reached max number of tokens: {max_num_tokens}")
                    break
                
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
    
    # resample_period = 10000
    # if (i % resample_period == 0):
    #     # RESAMPLING
    #     with torch.no_grad():
    #         # Count number of dead_features are zero
    #         num_dead_features = (total_activations == 0).sum().item()
    #         print(f"Dead Features: {num_dead_features}")
            
    #     if num_dead_features > 0:
    #         print("Resampling!")
    #         # hyperparams:
    #         max_resample_tokens = 1000 # the number of token activations that we consider for inserting into the dictionary
    #         # compute loss of model on random subset of inputs
    #         resample_loader = setup_token_data(cfg, tokenizer, model, seed=i)
    #         num_resample_data = 0

    #         resample_activations = torch.empty(0, activation_size)
    #         resample_losses = torch.empty(0)

    #         for resample_batch in resample_loader:
    #             resample_tokens = resample_batch["input_ids"].to(cfg.device)
    #             with torch.no_grad(): # As long as not doing KL divergence, don't need gradients for model
    #                 with Trace(model, tensor_names[0]) as ret:
    #                     _ = model(resample_tokens)
    #                     representation = ret.output
    #                     if(isinstance(representation, tuple)):
    #                         representation = representation[0]
    #             layer_activations = rearrange(representation, "b seq d_model -> (b seq) d_model")
    #             resample_activations = torch.cat((resample_activations, layer_activations.detach().cpu()), dim=0)

    #             c = autoencoder.encode(layer_activations)
    #             x_hat = autoencoder.decode(c)
                
    #             reconstruction_loss = (x_hat - layer_activations).pow(2).mean(dim=-1)
    #             l1_loss = torch.norm(c, 1, dim=-1)
    #             temp_loss = reconstruction_loss + cfg.l1_alpha*l1_loss
                
    #             resample_losses = torch.cat((resample_losses, temp_loss.detach().cpu()), dim=0)
                
    #             num_resample_data +=layer_activations.shape[0]
    #             if num_resample_data > max_resample_tokens:
    #                 break

                
    #         # sample num_dead_features vectors of input activations
    #         probabilities = resample_losses**2
    #         probabilities /= probabilities.sum()
    #         sampled_indices = torch.multinomial(probabilities, num_dead_features, replacement=True)
    #         new_vectors = resample_activations[sampled_indices]

    #         # calculate average encoder norm of alive neurons
    #         alive_neurons = list((total_activations!=0))
    #         modified_columns = total_activations==0
    #         avg_norm = autoencoder.encoder.data[alive_neurons].norm(dim=-1).mean()

    #         # replace dictionary and encoder weights with vectors
    #         new_vectors = new_vectors / new_vectors.norm(dim=1, keepdim=True)
            
    #         params_to_modify = [autoencoder.encoder, autoencoder.encoder_bias]

    #         current_weights = autoencoder.encoder.data
    #         current_weights[modified_columns] = (new_vectors.to(cfg.device) * avg_norm * 0.02)
    #         autoencoder.encoder.data = current_weights

    #         current_weights = autoencoder.encoder_bias.data
    #         current_weights[modified_columns] = 0
    #         autoencoder.encoder_bias.data = current_weights
            
    #         if hasattr(autoencoder, 'decoder'):
    #             current_weights = autoencoder.decoder.data
    #             current_weights[modified_columns] = new_vectors.to(cfg.device)
    #             autoencoder.decoder.data = current_weights
    #             params_to_modify += [autoencoder.decoder]

    #         for param_group in optimizer.param_groups:
    #             for param in param_group['params']:
    #                 if any(param is d_ for d_ in params_to_modify):
    #                     # Extract the corresponding rows from m and v
    #                     m = optimizer.state[param]['exp_avg']
    #                     v = optimizer.state[param]['exp_avg_sq']
                        
    #                     # Update the m and v values for the modified columns
    #                     m[modified_columns] = 0  # Reset moving average for modified columns
    #                     v[modified_columns] = 0  # Reset squared moving average for modified columns
        
    #     total_activations = torch.zeros(autoencoder.encoder.shape[0])
    
    
    

    # if ((i+2) % save_every ==0): # save periodically but before big changes
    #     model_save_name = cfg.model_name.split("/")[-1]
    #     save_name = f"{model_save_name}_sp{cfg.sparsity}_r{cfg.ratio}_{tensor_names[0]}_ckpt{num_saved_so_far}"  # trim year

    #     # Make directory traiend_models if it doesn't exist
    #     import os
    #     if not os.path.exists("trained_models"):
    #         os.makedirs("trained_models")
    #     # Save model
    #     torch.save(autoencoder, f"trained_models/{save_name}.pt")
        
    #     num_saved_so_far += 1

    # # Running sparsity check
    # num_tokens_so_far = i*cfg.max_length*cfg.model_batch_size
    # if(num_tokens_so_far > 200000):
    #     if(i % 100 == 0):
    #         with torch.no_grad():
    #             sparsity = (c != 0).float().mean(dim=0).sum().cpu().item()
    #         if sparsity > target_upper_sparsity:
    #             cfg.l1_alpha *= (1 + adjustment_factor)
    #         elif sparsity < target_lower_sparsity:
    #             cfg.l1_alpha *= (1 - adjustment_factor)
    #         # print(f"Sparsity: {sparsity:.1f} | l1_alpha: {cfg.l1_alpha:.2e}")

  0%|          | 1/48828 [00:00<2:19:28,  5.83it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 1031.1 | Dead Features: 2048 | Total Loss: 0.32 | Reconstruction Loss: 0.19 | L1 Loss: 0.13 | l1_alpha: 8.00e-04 | Tokens: 0 | Self Similarity: -0.00
Sparsity: 1030.8 | Dead Features: 2048 | Total Loss: 0.32 | Reconstruction Loss: 0.19 | L1 Loss: 0.13 | l1_alpha: 8.00e-04 | Tokens: 0 | Self Similarity: 0.00
Sparsity: 1004.0 | Dead Features: 2048 | Total Loss: 0.31 | Reconstruction Loss: 0.18 | L1 Loss: 0.13 | l1_alpha: 8.00e-04 | Tokens: 0 | Self Similarity: -0.00
Sparsity: 1023.5 | Dead Features: 2048 | Total Loss: 0.31 | Reconstruction Loss: 0.18 | L1 Loss: 0.13 | l1_alpha: 8.00e-04 | Tokens: 0 | Self Similarity: -0.00
Sparsity: 1026.6 | Dead Features: 2048 | Total Loss: 0.32 | Reconstruction Loss: 0.19 | L1 Loss: 0.13 | l1_alpha: 8.00e-04 | Tokens: 0 | Self Similarity: 0.00
Sparsity: 1008.9 | Dead Features: 2048 | Total Loss: 0.31 | Reconstruction Loss: 0.18 | L1 Loss: 0.13 | l1_alpha: 8.00e-04 | Tokens: 0 | Self Similarit

  0%|          | 5/48828 [00:00<1:22:25,  9.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 7/48828 [00:00<1:19:06, 10.29it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 11/48828 [00:01<1:16:19, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 13/48828 [00:01<1:15:16, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 17/48828 [00:01<1:14:25, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 19/48828 [00:01<1:14:49, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 23/48828 [00:02<1:14:12, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 25/48828 [00:02<1:13:48, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 29/48828 [00:02<1:13:40, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 31/48828 [00:02<1:13:56, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 35/48828 [00:03<1:13:20, 11.09it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 37/48828 [00:03<1:13:13, 11.11it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 41/48828 [00:03<1:13:47, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 43/48828 [00:03<1:13:38, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 47/48828 [00:04<1:13:05, 11.12it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 49/48828 [00:04<1:13:21, 11.08it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 53/48828 [00:04<1:13:34, 11.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 55/48828 [00:05<1:13:30, 11.06it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 59/48828 [00:05<1:13:15, 11.09it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 61/48828 [00:05<1:13:49, 11.01it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 65/48828 [00:05<1:13:25, 11.07it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 67/48828 [00:06<1:13:07, 11.11it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 71/48828 [00:06<1:13:45, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 73/48828 [00:06<1:13:49, 11.01it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 77/48828 [00:07<1:14:21, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 79/48828 [00:07<1:14:18, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 83/48828 [00:07<1:14:06, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 85/48828 [00:07<1:14:07, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 89/48828 [00:08<1:13:33, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 91/48828 [00:08<1:13:41, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 95/48828 [00:08<1:14:28, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 97/48828 [00:08<1:14:24, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 101/48828 [00:09<1:15:22, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 86.2 | Dead Features: 0 | Total Loss: 0.04 | Reconstruction Loss: 0.03 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 204800 | Self Similarity: 0.00
Sparsity: 85.6 | Dead Features: 0 | Total Loss: 0.04 | Reconstruction Loss: 0.03 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 204800 | Self Similarity: 0.08
Sparsity: 85.7 | Dead Features: 0 | Total Loss: 0.04 | Reconstruction Loss: 0.03 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 204800 | Self Similarity: 0.08
Sparsity: 85.3 | Dead Features: 0 | Total Loss: 0.04 | Reconstruction Loss: 0.03 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 204800 | Self Similarity: 0.08
Sparsity: 87.0 | Dead Features: 0 | Total Loss: 0.04 | Reconstruction Loss: 0.03 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 204800 | Self Similarity: 0.08
Sparsity: 90.3 | Dead Features: 0 | Total Loss: 0.04 | Reconstruction Loss: 0.03 | L1 Loss: 0.01 | l1_a

  0%|          | 103/48828 [00:09<1:14:44, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 107/48828 [00:09<1:14:31, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 109/48828 [00:09<1:14:16, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 113/48828 [00:10<1:13:41, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 115/48828 [00:10<1:13:29, 11.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 119/48828 [00:10<1:14:09, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 121/48828 [00:11<1:13:56, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 125/48828 [00:11<1:13:19, 11.07it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 127/48828 [00:11<1:13:27, 11.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 131/48828 [00:11<1:13:51, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 133/48828 [00:12<1:13:35, 11.03it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 137/48828 [00:12<1:13:20, 11.06it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 139/48828 [00:12<1:14:00, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 143/48828 [00:13<1:13:47, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 145/48828 [00:13<1:13:26, 11.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 149/48828 [00:13<1:13:31, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 151/48828 [00:13<1:13:58, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 155/48828 [00:14<1:13:37, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 157/48828 [00:14<1:13:27, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 161/48828 [00:14<1:13:55, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 163/48828 [00:14<1:14:05, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 167/48828 [00:15<1:13:32, 11.03it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 169/48828 [00:15<1:13:23, 11.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 173/48828 [00:15<1:14:04, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 175/48828 [00:15<1:13:53, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 179/48828 [00:16<1:13:24, 11.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 181/48828 [00:16<1:13:27, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 185/48828 [00:16<1:13:53, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 187/48828 [00:17<1:13:36, 11.01it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 191/48828 [00:17<1:13:15, 11.06it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 193/48828 [00:17<1:13:41, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 197/48828 [00:17<1:13:54, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 199/48828 [00:18<1:13:31, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 73.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08


  0%|          | 201/48828 [00:18<1:15:05, 10.79it/s]

Sparsity: 72.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08
Sparsity: 73.2 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08
Sparsity: 72.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08
Sparsity: 75.2 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08
Sparsity: 75.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08
Sparsity: 73.3 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 409600 | Self Similarity: 0.08
Sparsity: 73.4 | Dead Features: 0 | Total Loss

  0%|          | 205/48828 [00:18<1:14:19, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 207/48828 [00:18<1:14:36, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 211/48828 [00:19<1:13:49, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 213/48828 [00:19<1:13:29, 11.03it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 217/48828 [00:19<1:13:46, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 219/48828 [00:19<1:13:53, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 223/48828 [00:20<1:13:22, 11.04it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 225/48828 [00:20<1:13:12, 11.07it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 229/48828 [00:20<1:13:58, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 231/48828 [00:21<1:13:51, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 235/48828 [00:21<1:14:02, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 237/48828 [00:21<1:14:15, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 241/48828 [00:22<1:14:34, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  0%|          | 243/48828 [00:22<1:14:14, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 247/48828 [00:22<1:14:03, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 249/48828 [00:22<1:14:17, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 253/48828 [00:23<1:15:27, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 255/48828 [00:23<1:15:17, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 259/48828 [00:23<1:14:23, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 261/48828 [00:23<1:14:20, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 265/48828 [00:24<1:15:04, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 267/48828 [00:24<1:15:05, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 271/48828 [00:24<1:14:42, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 273/48828 [00:24<1:14:22, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 277/48828 [00:25<1:14:27, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 279/48828 [00:25<1:14:54, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 283/48828 [00:25<1:14:37, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 285/48828 [00:26<1:14:15, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 289/48828 [00:26<1:14:22, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 291/48828 [00:26<1:14:37, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 295/48828 [00:26<1:14:35, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 297/48828 [00:27<1:14:26, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 301/48828 [00:27<1:15:49, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 68.8 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 614400 | Self Similarity: 0.08
Sparsity: 69.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 614400 | Self Similarity: 0.08
Sparsity: 68.9 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 614400 | Self Similarity: 0.08
Sparsity: 69.2 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 614400 | Self Similarity: 0.08
Sparsity: 70.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 614400 | Self Similarity: 0.08
Sparsity: 70.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_a

  1%|          | 303/48828 [00:27<1:15:25, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 307/48828 [00:28<1:15:25, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 309/48828 [00:28<1:15:04, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 313/48828 [00:28<1:14:30, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 315/48828 [00:28<1:14:43, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 319/48828 [00:29<1:15:11, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 321/48828 [00:29<1:15:15, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 325/48828 [00:29<1:15:38, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 327/48828 [00:29<1:15:39, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 331/48828 [00:30<1:14:47, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 333/48828 [00:30<1:15:16, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 337/48828 [00:30<1:14:54, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 339/48828 [00:31<1:14:36, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 343/48828 [00:31<1:14:36, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 345/48828 [00:31<1:14:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 349/48828 [00:31<1:14:57, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 351/48828 [00:32<1:14:24, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 355/48828 [00:32<1:13:47, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 357/48828 [00:32<1:13:54, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 361/48828 [00:33<1:14:10, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 363/48828 [00:33<1:13:50, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 367/48828 [00:33<1:13:32, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 369/48828 [00:33<1:13:40, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 373/48828 [00:34<1:13:53, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 375/48828 [00:34<1:13:41, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 379/48828 [00:34<1:13:27, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 381/48828 [00:34<1:13:37, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 385/48828 [00:35<1:13:43, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 387/48828 [00:35<1:14:09, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 391/48828 [00:35<1:14:04, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 393/48828 [00:36<1:14:08, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 397/48828 [00:36<1:13:52, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 399/48828 [00:36<1:13:41, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 65.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08


  1%|          | 401/48828 [00:36<1:15:05, 10.75it/s]

Sparsity: 66.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08
Sparsity: 65.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08
Sparsity: 65.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08
Sparsity: 66.9 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08
Sparsity: 66.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08
Sparsity: 65.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 819200 | Self Similarity: 0.08
Sparsity: 65.1 | Dead Features: 0 | Total Loss

  1%|          | 405/48828 [00:37<1:14:24, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 407/48828 [00:37<1:14:29, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 411/48828 [00:37<1:13:54, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 413/48828 [00:37<1:13:36, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 417/48828 [00:38<1:13:38, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 419/48828 [00:38<1:14:10, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 423/48828 [00:38<1:13:34, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 425/48828 [00:38<1:13:21, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 429/48828 [00:39<1:13:35, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 431/48828 [00:39<1:13:57, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 435/48828 [00:39<1:13:37, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 437/48828 [00:40<1:13:34, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 441/48828 [00:40<1:13:50, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 443/48828 [00:40<1:13:50, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 447/48828 [00:40<1:13:19, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 449/48828 [00:41<1:13:10, 11.02it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 453/48828 [00:41<1:13:55, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 455/48828 [00:41<1:13:45, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 459/48828 [00:42<1:13:54, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 461/48828 [00:42<1:13:41, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 465/48828 [00:42<1:14:00, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 467/48828 [00:42<1:14:01, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 471/48828 [00:43<1:13:42, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 473/48828 [00:43<1:13:56, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 477/48828 [00:43<1:14:35, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 479/48828 [00:43<1:14:28, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 483/48828 [00:44<1:13:56, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 485/48828 [00:44<1:13:49, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 489/48828 [00:44<1:13:56, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 491/48828 [00:44<1:13:50, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 495/48828 [00:45<1:13:23, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 497/48828 [00:45<1:13:17, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 501/48828 [00:45<1:15:20, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 66.2 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1024000 | Self Similarity: 0.08
Sparsity: 67.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1024000 | Self Similarity: 0.08
Sparsity: 66.8 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1024000 | Self Similarity: 0.08
Sparsity: 66.5 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1024000 | Self Similarity: 0.08
Sparsity: 67.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1024000 | Self Similarity: 0.08
Sparsity: 67.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 |

  1%|          | 503/48828 [00:46<1:15:05, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 507/48828 [00:46<1:13:59, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 509/48828 [00:46<1:13:37, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 513/48828 [00:47<1:13:37, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 515/48828 [00:47<1:13:35, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 519/48828 [00:47<1:13:11, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 521/48828 [00:47<1:13:06, 11.01it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 525/48828 [00:48<1:13:51, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 527/48828 [00:48<1:13:44, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 531/48828 [00:48<1:13:14, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 533/48828 [00:48<1:13:14, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 537/48828 [00:49<1:13:50, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 539/48828 [00:49<1:13:30, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 543/48828 [00:49<1:13:08, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 545/48828 [00:49<1:13:14, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 549/48828 [00:50<1:13:57, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 551/48828 [00:50<1:14:01, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 555/48828 [00:50<1:13:27, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 557/48828 [00:51<1:13:32, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 561/48828 [00:51<1:14:13, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 563/48828 [00:51<1:13:52, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 567/48828 [00:51<1:13:21, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 569/48828 [00:52<1:13:24, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 573/48828 [00:52<1:14:13, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 575/48828 [00:52<1:13:47, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 579/48828 [00:53<1:13:12, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 581/48828 [00:53<1:13:07, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 585/48828 [00:53<1:13:49, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 587/48828 [00:53<1:13:37, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 591/48828 [00:54<1:13:11, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 593/48828 [00:54<1:13:27, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 597/48828 [00:54<1:13:59, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 599/48828 [00:54<1:13:38, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 70.8 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08


  1%|          | 601/48828 [00:55<1:15:03, 10.71it/s]

Sparsity: 71.5 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08
Sparsity: 71.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08
Sparsity: 70.3 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08
Sparsity: 71.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08
Sparsity: 70.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08
Sparsity: 70.5 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1228800 | Self Similarity: 0.08
Sparsity: 70.2 | Dead Features: 0 | Tota

  1%|          | 605/48828 [00:55<1:13:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|          | 607/48828 [00:55<1:13:45, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 611/48828 [00:55<1:13:39, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 613/48828 [00:56<1:13:23, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 617/48828 [00:56<1:13:21, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 619/48828 [00:56<1:13:29, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 623/48828 [00:57<1:13:20, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 625/48828 [00:57<1:13:15, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 629/48828 [00:57<1:13:22, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 631/48828 [00:57<1:14:00, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 635/48828 [00:58<1:14:06, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 637/48828 [00:58<1:13:46, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 641/48828 [00:58<1:13:25, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 643/48828 [00:58<1:13:50, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 647/48828 [00:59<1:13:39, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 649/48828 [00:59<1:13:20, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 653/48828 [00:59<1:13:14, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 655/48828 [01:00<1:13:24, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 659/48828 [01:00<1:13:14, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 661/48828 [01:00<1:13:00, 11.00it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 665/48828 [01:00<1:13:13, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 667/48828 [01:01<1:13:38, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 671/48828 [01:01<1:13:20, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 673/48828 [01:01<1:13:11, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 677/48828 [01:02<1:13:29, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 679/48828 [01:02<1:13:40, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 683/48828 [01:02<1:13:15, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 685/48828 [01:02<1:13:15, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 689/48828 [01:03<1:13:49, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 691/48828 [01:03<1:14:00, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 695/48828 [01:03<1:13:40, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 697/48828 [01:03<1:13:26, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 701/48828 [01:04<1:15:08, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 62.3 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1433600 | Self Similarity: 0.08
Sparsity: 62.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1433600 | Self Similarity: 0.08
Sparsity: 63.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1433600 | Self Similarity: 0.08
Sparsity: 62.3 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1433600 | Self Similarity: 0.08
Sparsity: 63.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1433600 | Self Similarity: 0.08
Sparsity: 62.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  1%|▏         | 703/48828 [01:04<1:15:05, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 707/48828 [01:04<1:14:55, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 709/48828 [01:04<1:14:11, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 713/48828 [01:05<1:13:33, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 715/48828 [01:05<1:13:38, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 719/48828 [01:05<1:14:11, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 721/48828 [01:06<1:13:49, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 725/48828 [01:06<1:13:25, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 727/48828 [01:06<1:13:14, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  1%|▏         | 731/48828 [01:06<1:13:29, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 733/48828 [01:07<1:13:14, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 737/48828 [01:07<1:13:01, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 739/48828 [01:07<1:13:11, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 743/48828 [01:08<1:13:32, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 745/48828 [01:08<1:13:15, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 749/48828 [01:08<1:12:55, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 751/48828 [01:08<1:13:01, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 755/48828 [01:09<1:13:20, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 757/48828 [01:09<1:13:14, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 761/48828 [01:09<1:12:54, 10.99it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 763/48828 [01:09<1:13:20, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 767/48828 [01:10<1:13:18, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 769/48828 [01:10<1:13:13, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 773/48828 [01:10<1:13:01, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 775/48828 [01:11<1:13:16, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 779/48828 [01:11<1:13:14, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 781/48828 [01:11<1:13:03, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 785/48828 [01:11<1:13:05, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 787/48828 [01:12<1:13:26, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 791/48828 [01:12<1:13:13, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 793/48828 [01:12<1:13:03, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 797/48828 [01:13<1:12:55, 10.98it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 799/48828 [01:13<1:13:17, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 68.2 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.08


  2%|▏         | 801/48828 [01:13<1:14:43, 10.71it/s]

Sparsity: 68.5 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.08
Sparsity: 68.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.07
Sparsity: 68.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.08
Sparsity: 68.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.08
Sparsity: 67.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.07
Sparsity: 67.8 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.02 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1638400 | Self Similarity: 0.08
Sparsity: 67.6 | Dead Features: 0 | Tota

  2%|▏         | 805/48828 [01:13<1:14:16, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 807/48828 [01:13<1:13:54, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 811/48828 [01:14<1:13:41, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 813/48828 [01:14<1:14:27, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 817/48828 [01:14<1:13:51, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 819/48828 [01:15<1:13:26, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 823/48828 [01:15<1:13:21, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 825/48828 [01:15<1:13:42, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 829/48828 [01:15<1:13:32, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 831/48828 [01:16<1:13:26, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 835/48828 [01:16<1:13:15, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 837/48828 [01:16<1:13:31, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 841/48828 [01:17<1:13:23, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 843/48828 [01:17<1:13:17, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 847/48828 [01:17<1:13:02, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 849/48828 [01:17<1:13:16, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 853/48828 [01:18<1:13:20, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 855/48828 [01:18<1:13:13, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 859/48828 [01:18<1:13:00, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 861/48828 [01:18<1:13:30, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 865/48828 [01:19<1:13:27, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 867/48828 [01:19<1:13:19, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 871/48828 [01:19<1:13:02, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 873/48828 [01:20<1:13:20, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 877/48828 [01:20<1:13:21, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 879/48828 [01:20<1:13:10, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 883/48828 [01:20<1:13:02, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 885/48828 [01:21<1:13:19, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 889/48828 [01:21<1:13:20, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 891/48828 [01:21<1:13:09, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 895/48828 [01:22<1:12:58, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 897/48828 [01:22<1:13:18, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 901/48828 [01:22<1:14:43, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 57.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1843200 | Self Similarity: 0.08
Sparsity: 57.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1843200 | Self Similarity: 0.08
Sparsity: 58.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1843200 | Self Similarity: 0.07
Sparsity: 57.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1843200 | Self Similarity: 0.08
Sparsity: 57.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 1843200 | Self Similarity: 0.08
Sparsity: 57.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  2%|▏         | 903/48828 [01:22<1:14:26, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 907/48828 [01:23<1:13:30, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 909/48828 [01:23<1:13:22, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 913/48828 [01:23<1:13:49, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 915/48828 [01:23<1:13:31, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 919/48828 [01:24<1:13:29, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 921/48828 [01:24<1:13:31, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 925/48828 [01:24<1:15:31, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 927/48828 [01:25<1:15:35, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 931/48828 [01:25<1:14:19, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 933/48828 [01:25<1:13:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 937/48828 [01:25<1:13:23, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 939/48828 [01:26<1:14:24, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 943/48828 [01:26<1:13:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 945/48828 [01:26<1:13:28, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 949/48828 [01:27<1:13:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 951/48828 [01:27<1:14:12, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 955/48828 [01:27<1:14:12, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 957/48828 [01:27<1:13:55, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 961/48828 [01:28<1:13:53, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 963/48828 [01:28<1:13:51, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 967/48828 [01:28<1:14:21, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 969/48828 [01:28<1:14:23, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 973/48828 [01:29<1:13:51, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 975/48828 [01:29<1:13:37, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 979/48828 [01:29<1:13:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 981/48828 [01:30<1:14:11, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 985/48828 [01:30<1:14:00, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 987/48828 [01:30<1:13:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 991/48828 [01:30<1:13:29, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 993/48828 [01:31<1:13:39, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 997/48828 [01:31<1:13:37, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 999/48828 [01:31<1:13:26, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 64.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.08


  2%|▏         | 1001/48828 [01:31<1:14:49, 10.65it/s]

Sparsity: 64.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.08
Sparsity: 65.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.07
Sparsity: 64.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.07
Sparsity: 64.5 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.08
Sparsity: 64.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.08
Sparsity: 64.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2048000 | Self Similarity: 0.08
Sparsity: 64.6 | Dead Features: 0 | Tota

  2%|▏         | 1005/48828 [01:32<1:13:43, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1007/48828 [01:32<1:14:09, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1011/48828 [01:32<1:14:09, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1013/48828 [01:32<1:13:48, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1017/48828 [01:33<1:13:07, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1019/48828 [01:33<1:13:12, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1023/48828 [01:33<1:13:43, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1025/48828 [01:34<1:13:23, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1029/48828 [01:34<1:12:52, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1031/48828 [01:34<1:12:55, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1035/48828 [01:35<1:13:44, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1037/48828 [01:35<1:13:30, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1041/48828 [01:35<1:12:59, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1043/48828 [01:35<1:12:56, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1047/48828 [01:36<1:14:13, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1049/48828 [01:36<1:14:07, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1053/48828 [01:36<1:13:13, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1055/48828 [01:36<1:13:01, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1059/48828 [01:37<1:13:36, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1061/48828 [01:37<1:13:49, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1065/48828 [01:37<1:13:11, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1067/48828 [01:37<1:12:57, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1071/48828 [01:38<1:13:21, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1073/48828 [01:38<1:13:53, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1077/48828 [01:38<1:13:27, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1079/48828 [01:39<1:13:07, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1083/48828 [01:39<1:13:06, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1085/48828 [01:39<1:13:43, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1089/48828 [01:39<1:13:23, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1091/48828 [01:40<1:13:02, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1095/48828 [01:40<1:13:07, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1097/48828 [01:40<1:13:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1101/48828 [01:41<1:14:59, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 62.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2252800 | Self Similarity: 0.08
Sparsity: 63.0 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2252800 | Self Similarity: 0.08
Sparsity: 63.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2252800 | Self Similarity: 0.07
Sparsity: 62.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2252800 | Self Similarity: 0.07
Sparsity: 62.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2252800 | Self Similarity: 0.08
Sparsity: 62.5 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  2%|▏         | 1103/48828 [01:41<1:14:18, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1107/48828 [01:41<1:13:24, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1109/48828 [01:41<1:13:13, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1113/48828 [01:42<1:13:38, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1115/48828 [01:42<1:13:53, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1119/48828 [01:42<1:13:13, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1121/48828 [01:42<1:13:10, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1125/48828 [01:43<1:13:46, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1127/48828 [01:43<1:13:44, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1131/48828 [01:43<1:13:19, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1133/48828 [01:44<1:13:09, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1137/48828 [01:44<1:13:50, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1139/48828 [01:44<1:14:08, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1143/48828 [01:44<1:13:20, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1145/48828 [01:45<1:13:05, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1149/48828 [01:45<1:13:11, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1151/48828 [01:45<1:13:33, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1155/48828 [01:46<1:13:13, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1157/48828 [01:46<1:13:11, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1161/48828 [01:46<1:13:04, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1163/48828 [01:46<1:13:17, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1167/48828 [01:47<1:13:17, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1169/48828 [01:47<1:13:15, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1173/48828 [01:47<1:13:04, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1175/48828 [01:47<1:13:21, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1179/48828 [01:48<1:13:30, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1181/48828 [01:48<1:13:32, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1185/48828 [01:48<1:13:04, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1187/48828 [01:49<1:13:01, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1191/48828 [01:49<1:13:39, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1193/48828 [01:49<1:13:27, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1197/48828 [01:49<1:13:04, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1199/48828 [01:50<1:13:00, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1201/48828 [01:50<1:14:56, 10.59it/s]

Sparsity: 59.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2457600 | Self Similarity: 0.08
Sparsity: 60.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2457600 | Self Similarity: 0.08
Sparsity: 60.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2457600 | Self Similarity: 0.07
Sparsity: 59.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2457600 | Self Similarity: 0.07
Sparsity: 60.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2457600 | Self Similarity: 0.08
Sparsity: 59.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2457600 | Self Similarity: 0.08
Sparsity: 60.0 | Dead Features: 0 | Tota

  2%|▏         | 1205/48828 [01:50<1:14:25, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1207/48828 [01:50<1:14:10, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1211/48828 [01:51<1:13:28, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1213/48828 [01:51<1:13:16, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1217/48828 [01:51<1:13:27, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  2%|▏         | 1219/48828 [01:52<1:13:48, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1223/48828 [01:52<1:13:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1225/48828 [01:52<1:13:03, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1229/48828 [01:52<1:13:20, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1231/48828 [01:53<1:13:36, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1235/48828 [01:53<1:13:24, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1237/48828 [01:53<1:13:16, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1241/48828 [01:54<1:13:03, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1243/48828 [01:54<1:13:14, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1247/48828 [01:54<1:13:23, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1249/48828 [01:54<1:13:21, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1253/48828 [01:55<1:12:54, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1255/48828 [01:55<1:13:13, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1259/48828 [01:55<1:14:07, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1261/48828 [01:55<1:13:58, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1265/48828 [01:56<1:13:58, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1267/48828 [01:56<1:13:37, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1271/48828 [01:56<1:13:24, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1273/48828 [01:57<1:13:38, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1277/48828 [01:57<1:12:56, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1279/48828 [01:57<1:12:42, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1283/48828 [01:57<1:13:07, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1285/48828 [01:58<1:13:14, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1289/48828 [01:58<1:12:56, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1291/48828 [01:58<1:12:43, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1295/48828 [01:59<1:12:57, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1297/48828 [01:59<1:13:27, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1301/48828 [01:59<1:14:42, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 59.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2662400 | Self Similarity: 0.08
Sparsity: 59.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2662400 | Self Similarity: 0.08
Sparsity: 59.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2662400 | Self Similarity: 0.07
Sparsity: 58.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2662400 | Self Similarity: 0.07
Sparsity: 58.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2662400 | Self Similarity: 0.08
Sparsity: 59.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  3%|▎         | 1303/48828 [01:59<1:13:57, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1307/48828 [02:00<1:13:03, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1309/48828 [02:00<1:13:05, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1313/48828 [02:00<1:13:20, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1315/48828 [02:00<1:13:03, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1319/48828 [02:01<1:12:37, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1321/48828 [02:01<1:12:43, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1325/48828 [02:01<1:13:14, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1327/48828 [02:01<1:13:04, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1331/48828 [02:02<1:12:43, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1333/48828 [02:02<1:12:43, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1337/48828 [02:02<1:13:17, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1339/48828 [02:03<1:13:18, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1343/48828 [02:03<1:12:51, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1345/48828 [02:03<1:12:35, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1349/48828 [02:04<1:12:56, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1351/48828 [02:04<1:13:16, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1355/48828 [02:04<1:12:48, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1357/48828 [02:04<1:12:35, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1361/48828 [02:05<1:12:56, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1363/48828 [02:05<1:13:28, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1367/48828 [02:05<1:12:51, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1369/48828 [02:05<1:12:36, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1373/48828 [02:06<1:12:47, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1375/48828 [02:06<1:13:08, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1379/48828 [02:06<1:12:40, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1381/48828 [02:06<1:12:33, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1385/48828 [02:07<1:12:30, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1387/48828 [02:07<1:12:51, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1391/48828 [02:07<1:12:37, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1393/48828 [02:08<1:12:33, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1397/48828 [02:08<1:12:35, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1399/48828 [02:08<1:12:57, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1401/48828 [02:08<1:14:45, 10.57it/s]

Sparsity: 55.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2867200 | Self Similarity: 0.08
Sparsity: 55.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2867200 | Self Similarity: 0.08
Sparsity: 56.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2867200 | Self Similarity: 0.07
Sparsity: 55.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2867200 | Self Similarity: 0.07
Sparsity: 55.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2867200 | Self Similarity: 0.08
Sparsity: 55.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 2867200 | Self Similarity: 0.08
Sparsity: 55.5 | Dead Features: 0 | Tota

  3%|▎         | 1405/48828 [02:09<1:13:47, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1407/48828 [02:09<1:13:21, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1411/48828 [02:09<1:13:19, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1413/48828 [02:09<1:13:02, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1417/48828 [02:10<1:12:57, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1419/48828 [02:10<1:12:37, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1423/48828 [02:10<1:12:20, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1425/48828 [02:11<1:12:16, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1429/48828 [02:11<1:12:31, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1431/48828 [02:11<1:12:18, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1435/48828 [02:11<1:12:08, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1437/48828 [02:12<1:12:13, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1441/48828 [02:12<1:12:35, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1443/48828 [02:12<1:12:18, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1447/48828 [02:13<1:12:02, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1449/48828 [02:13<1:12:21, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1453/48828 [02:13<1:13:02, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1455/48828 [02:13<1:12:41, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1459/48828 [02:14<1:12:18, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1461/48828 [02:14<1:12:18, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1465/48828 [02:14<1:12:52, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1467/48828 [02:14<1:12:35, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1471/48828 [02:15<1:12:12, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1473/48828 [02:15<1:12:21, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1477/48828 [02:15<1:12:58, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1479/48828 [02:15<1:12:37, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1483/48828 [02:16<1:12:13, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1485/48828 [02:16<1:12:12, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1489/48828 [02:16<1:12:47, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1491/48828 [02:17<1:12:27, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1495/48828 [02:17<1:12:34, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1497/48828 [02:17<1:12:27, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1501/48828 [02:18<1:14:26, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 57.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3072000 | Self Similarity: 0.08
Sparsity: 57.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3072000 | Self Similarity: 0.08
Sparsity: 57.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3072000 | Self Similarity: 0.07
Sparsity: 57.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3072000 | Self Similarity: 0.07
Sparsity: 57.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3072000 | Self Similarity: 0.08
Sparsity: 57.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  3%|▎         | 1503/48828 [02:18<1:14:13, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1507/48828 [02:18<1:12:59, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1509/48828 [02:18<1:12:39, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1513/48828 [02:19<1:12:27, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1515/48828 [02:19<1:12:41, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1519/48828 [02:19<1:12:17, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1521/48828 [02:19<1:12:07, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1525/48828 [02:20<1:11:59, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1527/48828 [02:20<1:12:22, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1531/48828 [02:20<1:12:04, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1533/48828 [02:20<1:11:55, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1537/48828 [02:21<1:12:13, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1539/48828 [02:21<1:12:40, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1543/48828 [02:21<1:12:20, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1545/48828 [02:22<1:12:10, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1549/48828 [02:22<1:12:29, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1551/48828 [02:22<1:12:45, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1555/48828 [02:22<1:12:18, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1557/48828 [02:23<1:11:57, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1561/48828 [02:23<1:12:00, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1563/48828 [02:23<1:12:28, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1567/48828 [02:24<1:12:08, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1569/48828 [02:24<1:11:58, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1573/48828 [02:24<1:12:15, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1575/48828 [02:24<1:12:38, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1579/48828 [02:25<1:12:12, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1581/48828 [02:25<1:12:05, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1585/48828 [02:25<1:12:05, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1587/48828 [02:25<1:12:29, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1591/48828 [02:26<1:12:08, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1593/48828 [02:26<1:11:58, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1597/48828 [02:26<1:12:20, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1599/48828 [02:27<1:13:03, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1601/48828 [02:27<1:15:21, 10.44it/s]

Sparsity: 57.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3276800 | Self Similarity: 0.08
Sparsity: 58.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3276800 | Self Similarity: 0.08
Sparsity: 58.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3276800 | Self Similarity: 0.08
Sparsity: 58.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3276800 | Self Similarity: 0.07
Sparsity: 58.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3276800 | Self Similarity: 0.08
Sparsity: 57.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3276800 | Self Similarity: 0.08
Sparsity: 58.4 | Dead Features: 0 | Tota

  3%|▎         | 1605/48828 [02:27<1:16:20, 10.31it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1607/48828 [02:27<1:16:19, 10.31it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1611/48828 [02:28<1:14:42, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1613/48828 [02:28<1:14:07, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1617/48828 [02:28<1:13:39, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1619/48828 [02:28<1:14:11, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1623/48828 [02:29<1:14:41, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1625/48828 [02:29<1:14:18, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1629/48828 [02:29<1:13:18, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1631/48828 [02:30<1:12:56, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1635/48828 [02:30<1:13:09, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1637/48828 [02:30<1:13:26, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1641/48828 [02:30<1:12:47, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1643/48828 [02:31<1:12:31, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1647/48828 [02:31<1:12:36, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1649/48828 [02:31<1:13:04, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1653/48828 [02:32<1:13:00, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1655/48828 [02:32<1:12:46, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1659/48828 [02:32<1:12:38, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1661/48828 [02:32<1:12:44, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1665/48828 [02:33<1:12:49, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1667/48828 [02:33<1:12:35, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1671/48828 [02:33<1:12:04, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1673/48828 [02:33<1:12:24, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1677/48828 [02:34<1:12:52, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1679/48828 [02:34<1:12:45, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1683/48828 [02:34<1:12:17, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1685/48828 [02:35<1:12:09, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1689/48828 [02:35<1:12:34, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1691/48828 [02:35<1:12:30, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1695/48828 [02:35<1:12:07, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1697/48828 [02:36<1:12:01, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1701/48828 [02:36<1:14:18, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 55.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3481600 | Self Similarity: 0.08
Sparsity: 55.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3481600 | Self Similarity: 0.08
Sparsity: 55.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3481600 | Self Similarity: 0.08
Sparsity: 55.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3481600 | Self Similarity: 0.07
Sparsity: 55.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3481600 | Self Similarity: 0.08
Sparsity: 55.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  3%|▎         | 1703/48828 [02:36<1:14:15, 10.58it/s]


representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  3%|▎         | 1705/48828 [02:36<1:14:07, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1709/48828 [02:37<1:12:52, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1711/48828 [02:37<1:12:30, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1715/48828 [02:37<1:12:21, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1717/48828 [02:38<1:12:55, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1721/48828 [02:38<1:12:26, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1723/48828 [02:38<1:12:08, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1727/48828 [02:38<1:12:11, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1729/48828 [02:39<1:12:29, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1733/48828 [02:39<1:12:28, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1735/48828 [02:39<1:12:18, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1739/48828 [02:40<1:12:00, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1741/48828 [02:40<1:12:14, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1745/48828 [02:40<1:13:17, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1747/48828 [02:40<1:13:12, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1751/48828 [02:41<1:12:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1753/48828 [02:41<1:12:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1757/48828 [02:41<1:12:46, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1759/48828 [02:41<1:12:39, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1763/48828 [02:42<1:12:08, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1765/48828 [02:42<1:11:59, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1769/48828 [02:42<1:12:24, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1771/48828 [02:42<1:12:24, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1775/48828 [02:43<1:12:07, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1777/48828 [02:43<1:11:57, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1781/48828 [02:43<1:12:18, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1783/48828 [02:44<1:12:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1787/48828 [02:44<1:12:00, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1789/48828 [02:44<1:11:47, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1793/48828 [02:45<1:12:26, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1795/48828 [02:45<1:12:51, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1799/48828 [02:45<1:12:23, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1801/48828 [02:45<1:13:46, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 57.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3686400 | Self Similarity: 0.08
Sparsity: 58.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3686400 | Self Similarity: 0.08
Sparsity: 58.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3686400 | Self Similarity: 0.08
Sparsity: 58.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3686400 | Self Similarity: 0.08
Sparsity: 58.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3686400 | Self Similarity: 0.08
Sparsity: 58.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3686400 | Self Simila

  4%|▎         | 1805/48828 [02:46<1:12:48, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1807/48828 [02:46<1:13:05, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1811/48828 [02:46<1:13:26, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1813/48828 [02:46<1:13:01, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1817/48828 [02:47<1:12:22, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1819/48828 [02:47<1:12:20, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1823/48828 [02:47<1:12:51, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1825/48828 [02:48<1:12:53, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1829/48828 [02:48<1:12:19, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▎         | 1831/48828 [02:48<1:12:07, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1835/48828 [02:48<1:12:30, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1837/48828 [02:49<1:12:49, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1841/48828 [02:49<1:12:21, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1843/48828 [02:49<1:12:10, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1847/48828 [02:50<1:12:37, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1849/48828 [02:50<1:12:52, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1853/48828 [02:50<1:12:45, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1855/48828 [02:50<1:12:28, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1859/48828 [02:51<1:12:01, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1861/48828 [02:51<1:12:18, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1865/48828 [02:51<1:13:48, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1867/48828 [02:51<1:14:18, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1871/48828 [02:52<1:13:09, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1873/48828 [02:52<1:12:45, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1877/48828 [02:52<1:12:39, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1879/48828 [02:53<1:12:53, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1883/48828 [02:53<1:12:34, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1885/48828 [02:53<1:12:28, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1889/48828 [02:53<1:12:13, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1891/48828 [02:54<1:12:36, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1895/48828 [02:54<1:12:52, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1897/48828 [02:54<1:12:28, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1901/48828 [02:55<1:13:42, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 56.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3891200 | Self Similarity: 0.08
Sparsity: 57.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3891200 | Self Similarity: 0.08
Sparsity: 57.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3891200 | Self Similarity: 0.08
Sparsity: 57.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3891200 | Self Similarity: 0.08
Sparsity: 57.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 3891200 | Self Similarity: 0.08
Sparsity: 57.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  4%|▍         | 1903/48828 [02:55<1:13:08, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1907/48828 [02:55<1:13:16, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1909/48828 [02:55<1:13:29, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1913/48828 [02:56<1:12:56, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1915/48828 [02:56<1:12:41, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1919/48828 [02:56<1:12:08, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1921/48828 [02:56<1:12:24, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1925/48828 [02:57<1:12:28, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1927/48828 [02:57<1:12:10, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1931/48828 [02:57<1:11:49, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1933/48828 [02:58<1:11:57, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1937/48828 [02:58<1:12:23, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1939/48828 [02:58<1:12:15, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1943/48828 [02:58<1:12:56, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1945/48828 [02:59<1:12:48, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1949/48828 [02:59<1:13:28, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1951/48828 [02:59<1:13:58, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1955/48828 [03:00<1:13:53, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1957/48828 [03:00<1:13:27, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1961/48828 [03:00<1:12:48, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1963/48828 [03:00<1:12:44, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1967/48828 [03:01<1:13:26, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1969/48828 [03:01<1:13:59, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1973/48828 [03:01<1:13:36, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1975/48828 [03:01<1:13:21, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1979/48828 [03:02<1:12:56, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1981/48828 [03:02<1:13:01, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1985/48828 [03:02<1:13:50, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1987/48828 [03:03<1:14:01, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1991/48828 [03:03<1:13:11, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1993/48828 [03:03<1:12:58, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1997/48828 [03:04<1:12:43, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 1999/48828 [03:04<1:13:31, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2001/48828 [03:04<1:15:38, 10.32it/s]

Sparsity: 55.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4096000 | Self Similarity: 0.08
Sparsity: 54.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4096000 | Self Similarity: 0.08
Sparsity: 55.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4096000 | Self Similarity: 0.08
Sparsity: 55.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4096000 | Self Similarity: 0.08
Sparsity: 55.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4096000 | Self Similarity: 0.08
Sparsity: 55.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4096000 | Self Similarity: 0.08
Sparsity: 55.3 | Dead Features: 0 | Tota

  4%|▍         | 2005/48828 [03:04<1:15:08, 10.39it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2007/48828 [03:05<1:14:45, 10.44it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2011/48828 [03:05<1:13:40, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2013/48828 [03:05<1:13:15, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2017/48828 [03:05<1:12:50, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2019/48828 [03:06<1:13:22, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2023/48828 [03:06<1:14:21, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2025/48828 [03:06<1:13:57, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2029/48828 [03:07<1:13:14, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2031/48828 [03:07<1:12:53, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2035/48828 [03:07<1:12:51, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2037/48828 [03:07<1:13:42, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2041/48828 [03:08<1:14:24, 10.48it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2043/48828 [03:08<1:13:54, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2047/48828 [03:08<1:13:27, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2049/48828 [03:08<1:13:11, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2053/48828 [03:09<1:13:09, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2055/48828 [03:09<1:13:43, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2059/48828 [03:09<1:14:24, 10.48it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2061/48828 [03:10<1:13:59, 10.54it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2065/48828 [03:10<1:13:03, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2067/48828 [03:10<1:12:47, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2071/48828 [03:11<1:13:01, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2073/48828 [03:11<1:13:53, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2077/48828 [03:11<1:13:57, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2079/48828 [03:11<1:13:30, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2083/48828 [03:12<1:12:49, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2085/48828 [03:12<1:12:45, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2089/48828 [03:12<1:13:16, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2091/48828 [03:12<1:13:42, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2095/48828 [03:13<1:13:30, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2097/48828 [03:13<1:13:06, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2101/48828 [03:13<1:14:17, 10.48it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 54.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4300800 | Self Similarity: 0.08
Sparsity: 54.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4300800 | Self Similarity: 0.08
Sparsity: 55.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4300800 | Self Similarity: 0.08
Sparsity: 54.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4300800 | Self Similarity: 0.08
Sparsity: 54.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4300800 | Self Similarity: 0.08
Sparsity: 54.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  4%|▍         | 2103/48828 [03:14<1:13:39, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2105/48828 [03:14<1:13:22, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2109/48828 [03:14<1:13:57, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2111/48828 [03:14<1:14:05, 10.51it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2115/48828 [03:15<1:13:41, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2117/48828 [03:15<1:13:11, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2121/48828 [03:15<1:12:35, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2123/48828 [03:15<1:12:49, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2127/48828 [03:16<1:13:45, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2129/48828 [03:16<1:13:45, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2133/48828 [03:16<1:12:58, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2135/48828 [03:17<1:12:46, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2139/48828 [03:17<1:12:38, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2141/48828 [03:17<1:13:05, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2145/48828 [03:18<1:13:33, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2147/48828 [03:18<1:13:16, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2151/48828 [03:18<1:12:37, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2153/48828 [03:18<1:12:28, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2157/48828 [03:19<1:13:02, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2159/48828 [03:19<1:13:27, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2163/48828 [03:19<1:13:08, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2165/48828 [03:19<1:12:46, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2169/48828 [03:20<1:12:32, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2171/48828 [03:20<1:12:35, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2175/48828 [03:20<1:13:36, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2177/48828 [03:21<1:13:52, 10.52it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2181/48828 [03:21<1:13:09, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2183/48828 [03:21<1:12:55, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2187/48828 [03:21<1:12:34, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2189/48828 [03:22<1:12:44, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2193/48828 [03:22<1:13:54, 10.52it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  4%|▍         | 2195/48828 [03:22<1:13:57, 10.51it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2199/48828 [03:23<1:13:29, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2201/48828 [03:23<1:14:57, 10.37it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 50.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4505600 | Self Similarity: 0.08
Sparsity: 51.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4505600 | Self Similarity: 0.08
Sparsity: 51.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4505600 | Self Similarity: 0.08
Sparsity: 51.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4505600 | Self Similarity: 0.08
Sparsity: 51.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4505600 | Self Similarity: 0.08
Sparsity: 51.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4505600 | Self Simila

  5%|▍         | 2203/48828 [03:23<1:14:03, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2207/48828 [03:23<1:13:28, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2209/48828 [03:24<1:13:25, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2213/48828 [03:24<1:14:03, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2215/48828 [03:24<1:14:36, 10.41it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2219/48828 [03:24<1:13:50, 10.52it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2221/48828 [03:25<1:13:20, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2225/48828 [03:25<1:12:50, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2227/48828 [03:25<1:12:46, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2231/48828 [03:26<1:13:27, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2233/48828 [03:26<1:13:43, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2237/48828 [03:26<1:12:57, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2239/48828 [03:26<1:12:36, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2243/48828 [03:27<1:12:23, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2245/48828 [03:27<1:12:48, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2249/48828 [03:27<1:13:38, 10.54it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2251/48828 [03:28<1:13:32, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2255/48828 [03:28<1:12:48, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2257/48828 [03:28<1:12:27, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2261/48828 [03:28<1:12:35, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2263/48828 [03:29<1:13:00, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2267/48828 [03:29<1:13:29, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2269/48828 [03:29<1:13:09, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2273/48828 [03:30<1:12:40, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2275/48828 [03:30<1:12:43, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2279/48828 [03:30<1:12:59, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2281/48828 [03:30<1:13:13, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2285/48828 [03:31<1:13:03, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2287/48828 [03:31<1:12:43, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2291/48828 [03:31<1:12:11, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2293/48828 [03:31<1:12:16, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2297/48828 [03:32<1:13:03, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2299/48828 [03:32<1:13:19, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2301/48828 [03:32<1:14:38, 10.39it/s]

Sparsity: 52.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4710400 | Self Similarity: 0.08
Sparsity: 52.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4710400 | Self Similarity: 0.08
Sparsity: 52.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4710400 | Self Similarity: 0.08
Sparsity: 53.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4710400 | Self Similarity: 0.08
Sparsity: 53.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4710400 | Self Similarity: 0.08
Sparsity: 52.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4710400 | Self Similarity: 0.08
Sparsity: 52.6 | Dead Features: 0 | Tota

  5%|▍         | 2305/48828 [03:33<1:13:15, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2307/48828 [03:33<1:13:13, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2311/48828 [03:33<1:12:30, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2313/48828 [03:33<1:12:46, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2317/48828 [03:34<1:13:18, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2319/48828 [03:34<1:12:56, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2323/48828 [03:34<1:12:41, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2325/48828 [03:34<1:12:44, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2329/48828 [03:35<1:12:45, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2331/48828 [03:35<1:13:26, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2335/48828 [03:35<1:12:53, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2337/48828 [03:36<1:12:22, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2341/48828 [03:36<1:11:34, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2343/48828 [03:36<1:11:26, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2347/48828 [03:37<1:12:05, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2349/48828 [03:37<1:11:44, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2353/48828 [03:37<1:11:14, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2355/48828 [03:37<1:11:57, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2359/48828 [03:38<1:12:01, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2361/48828 [03:38<1:12:12, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2365/48828 [03:38<1:11:39, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2367/48828 [03:38<1:11:23, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2371/48828 [03:39<1:11:12, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2373/48828 [03:39<1:11:42, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2377/48828 [03:39<1:11:15, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2379/48828 [03:39<1:11:05, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2383/48828 [03:40<1:11:14, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2385/48828 [03:40<1:11:30, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2389/48828 [03:40<1:11:19, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2391/48828 [03:41<1:11:07, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2395/48828 [03:41<1:11:36, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2397/48828 [03:41<1:11:40, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2401/48828 [03:42<1:13:17, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 56.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4915200 | Self Similarity: 0.08
Sparsity: 57.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4915200 | Self Similarity: 0.08
Sparsity: 57.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4915200 | Self Similarity: 0.08
Sparsity: 57.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4915200 | Self Similarity: 0.08
Sparsity: 57.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 4915200 | Self Similarity: 0.08
Sparsity: 57.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  5%|▍         | 2403/48828 [03:42<1:12:36, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2407/48828 [03:42<1:11:38, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2409/48828 [03:42<1:11:45, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2413/48828 [03:43<1:11:52, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2415/48828 [03:43<1:11:46, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2419/48828 [03:43<1:11:10, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2421/48828 [03:43<1:11:12, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2425/48828 [03:44<1:11:52, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2427/48828 [03:44<1:12:12, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2431/48828 [03:44<1:11:54, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2433/48828 [03:44<1:11:35, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2437/48828 [03:45<1:11:23, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▍         | 2439/48828 [03:45<1:11:48, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2443/48828 [03:45<1:11:56, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2445/48828 [03:46<1:11:37, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2449/48828 [03:46<1:11:15, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2451/48828 [03:46<1:11:18, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2455/48828 [03:47<1:12:00, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2457/48828 [03:47<1:11:48, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2461/48828 [03:47<1:11:20, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2463/48828 [03:47<1:11:20, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2467/48828 [03:48<1:11:42, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2469/48828 [03:48<1:11:49, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2473/48828 [03:48<1:11:13, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2475/48828 [03:48<1:10:58, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2479/48828 [03:49<1:11:32, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2481/48828 [03:49<1:11:47, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2485/48828 [03:49<1:11:20, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2487/48828 [03:49<1:11:12, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2491/48828 [03:50<1:11:01, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2493/48828 [03:50<1:11:19, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2497/48828 [03:50<1:11:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2499/48828 [03:51<1:11:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 50.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08


  5%|▌         | 2501/48828 [03:51<1:12:43, 10.62it/s]

Sparsity: 51.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08
Sparsity: 50.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08
Sparsity: 51.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08
Sparsity: 50.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08
Sparsity: 50.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08
Sparsity: 50.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5120000 | Self Similarity: 0.08
Sparsity: 51.0 | Dead Features: 0 | Tota

  5%|▌         | 2505/48828 [03:51<1:11:45, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2507/48828 [03:51<1:12:16, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2511/48828 [03:52<1:11:54, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2513/48828 [03:52<1:11:23, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2517/48828 [03:52<1:10:55, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2519/48828 [03:52<1:11:10, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2523/48828 [03:53<1:11:30, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2525/48828 [03:53<1:11:21, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2529/48828 [03:53<1:10:49, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2531/48828 [03:54<1:10:51, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2535/48828 [03:54<1:11:29, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2537/48828 [03:54<1:11:22, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2541/48828 [03:54<1:10:48, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2543/48828 [03:55<1:10:41, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2547/48828 [03:55<1:11:27, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2549/48828 [03:55<1:11:25, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2553/48828 [03:56<1:10:59, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2555/48828 [03:56<1:10:49, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2559/48828 [03:56<1:11:29, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2561/48828 [03:56<1:11:56, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2565/48828 [03:57<1:11:19, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2567/48828 [03:57<1:11:11, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2571/48828 [03:57<1:11:05, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2573/48828 [03:57<1:11:33, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2577/48828 [03:58<1:11:17, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2579/48828 [03:58<1:11:18, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2583/48828 [03:58<1:11:03, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2585/48828 [03:59<1:11:14, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2589/48828 [03:59<1:11:30, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2591/48828 [03:59<1:11:12, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2595/48828 [03:59<1:10:42, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2597/48828 [04:00<1:11:22, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2601/48828 [04:00<1:13:05, 10.54it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 52.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5324800 | Self Similarity: 0.08
Sparsity: 52.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5324800 | Self Similarity: 0.08
Sparsity: 52.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5324800 | Self Similarity: 0.08
Sparsity: 53.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5324800 | Self Similarity: 0.08
Sparsity: 52.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5324800 | Self Similarity: 0.08
Sparsity: 52.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  5%|▌         | 2603/48828 [04:00<1:12:56, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2607/48828 [04:01<1:11:54, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2609/48828 [04:01<1:12:42, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2613/48828 [04:01<1:12:39, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2615/48828 [04:01<1:12:26, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2619/48828 [04:02<1:12:10, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2621/48828 [04:02<1:12:10, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2625/48828 [04:02<1:11:18, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2627/48828 [04:02<1:11:04, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2631/48828 [04:03<1:11:11, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2633/48828 [04:03<1:11:24, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2637/48828 [04:03<1:10:51, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2639/48828 [04:04<1:10:41, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2643/48828 [04:04<1:11:09, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2645/48828 [04:04<1:11:19, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2649/48828 [04:04<1:10:59, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2651/48828 [04:05<1:10:46, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2655/48828 [04:05<1:11:03, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2657/48828 [04:05<1:11:19, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2661/48828 [04:06<1:11:18, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2663/48828 [04:06<1:11:03, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2667/48828 [04:06<1:10:48, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2669/48828 [04:06<1:10:58, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2673/48828 [04:07<1:11:02, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2675/48828 [04:07<1:10:53, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2679/48828 [04:07<1:10:43, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2681/48828 [04:07<1:10:50, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  5%|▌         | 2685/48828 [04:08<1:11:11, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2687/48828 [04:08<1:10:55, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2691/48828 [04:08<1:10:33, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2693/48828 [04:09<1:10:34, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2697/48828 [04:09<1:11:02, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2699/48828 [04:09<1:10:47, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 52.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08


  6%|▌         | 2701/48828 [04:09<1:12:21, 10.62it/s]

Sparsity: 52.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08
Sparsity: 52.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08
Sparsity: 53.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08
Sparsity: 53.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08
Sparsity: 53.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08
Sparsity: 52.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5529600 | Self Similarity: 0.08
Sparsity: 53.2 | Dead Features: 0 | Tota

  6%|▌         | 2705/48828 [04:10<1:11:27, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2707/48828 [04:10<1:11:17, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2711/48828 [04:10<1:11:42, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2713/48828 [04:10<1:11:33, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2717/48828 [04:11<1:10:56, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2719/48828 [04:11<1:11:30, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2723/48828 [04:11<1:11:05, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2725/48828 [04:12<1:11:11, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2729/48828 [04:12<1:10:50, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2731/48828 [04:12<1:10:47, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2735/48828 [04:12<1:11:04, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2737/48828 [04:13<1:11:15, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2741/48828 [04:13<1:11:09, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2743/48828 [04:13<1:11:45, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2747/48828 [04:14<1:11:02, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2749/48828 [04:14<1:11:01, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2753/48828 [04:14<1:11:21, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2755/48828 [04:14<1:11:12, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2759/48828 [04:15<1:10:43, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2761/48828 [04:15<1:10:44, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2765/48828 [04:15<1:12:00, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2767/48828 [04:15<1:11:50, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2771/48828 [04:16<1:11:03, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2773/48828 [04:16<1:10:42, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2777/48828 [04:16<1:11:20, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2779/48828 [04:17<1:11:41, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2783/48828 [04:17<1:11:11, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2785/48828 [04:17<1:10:53, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2789/48828 [04:17<1:10:59, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2791/48828 [04:18<1:10:50, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2795/48828 [04:18<1:10:52, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2797/48828 [04:18<1:10:41, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2801/48828 [04:19<1:12:15, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 57.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5734400 | Self Similarity: 0.08
Sparsity: 57.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5734400 | Self Similarity: 0.08
Sparsity: 57.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5734400 | Self Similarity: 0.08
Sparsity: 57.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5734400 | Self Similarity: 0.08
Sparsity: 58.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5734400 | Self Similarity: 0.08
Sparsity: 57.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  6%|▌         | 2803/48828 [04:19<1:11:32, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2807/48828 [04:19<1:11:30, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2809/48828 [04:19<1:11:14, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2813/48828 [04:20<1:10:39, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2815/48828 [04:20<1:10:28, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2819/48828 [04:20<1:11:00, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2821/48828 [04:20<1:12:20, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2825/48828 [04:21<1:11:11, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2827/48828 [04:21<1:10:47, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2831/48828 [04:21<1:10:31, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2833/48828 [04:22<1:10:49, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2837/48828 [04:22<1:10:41, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2839/48828 [04:22<1:10:31, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2843/48828 [04:22<1:10:14, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2845/48828 [04:23<1:10:34, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2849/48828 [04:23<1:10:45, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2851/48828 [04:23<1:10:34, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2855/48828 [04:24<1:10:13, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2857/48828 [04:24<1:10:21, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2861/48828 [04:24<1:10:31, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2863/48828 [04:24<1:10:23, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2867/48828 [04:25<1:10:04, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2869/48828 [04:25<1:10:16, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2873/48828 [04:25<1:10:37, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2875/48828 [04:25<1:10:22, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2879/48828 [04:26<1:10:05, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2881/48828 [04:26<1:10:14, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2885/48828 [04:26<1:10:36, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2887/48828 [04:26<1:10:21, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2891/48828 [04:27<1:10:03, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2893/48828 [04:27<1:10:06, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2897/48828 [04:27<1:10:29, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2899/48828 [04:28<1:10:10, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 52.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.09


  6%|▌         | 2901/48828 [04:28<1:11:39, 10.68it/s]

Sparsity: 51.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.08
Sparsity: 52.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.08
Sparsity: 52.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.08
Sparsity: 52.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.08
Sparsity: 52.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.08
Sparsity: 52.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 5939200 | Self Similarity: 0.08
Sparsity: 52.4 | Dead Features: 0 | Tota

  6%|▌         | 2905/48828 [04:28<1:10:46, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2907/48828 [04:28<1:10:40, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2911/48828 [04:29<1:10:43, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2913/48828 [04:29<1:10:28, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2917/48828 [04:29<1:10:07, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2919/48828 [04:29<1:10:11, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2923/48828 [04:30<1:10:33, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2925/48828 [04:30<1:10:19, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2929/48828 [04:30<1:09:53, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2931/48828 [04:31<1:09:56, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2935/48828 [04:31<1:10:22, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2937/48828 [04:31<1:10:10, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2941/48828 [04:31<1:09:58, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2943/48828 [04:32<1:10:04, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2947/48828 [04:32<1:10:53, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2949/48828 [04:32<1:10:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2953/48828 [04:33<1:10:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2955/48828 [04:33<1:10:23, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2959/48828 [04:33<1:10:49, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2961/48828 [04:33<1:10:41, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2965/48828 [04:34<1:10:05, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2967/48828 [04:34<1:10:01, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2971/48828 [04:34<1:10:31, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2973/48828 [04:34<1:10:32, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2977/48828 [04:35<1:10:53, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2979/48828 [04:35<1:10:35, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2983/48828 [04:35<1:10:28, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2985/48828 [04:36<1:10:56, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2989/48828 [04:36<1:10:20, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2991/48828 [04:36<1:10:08, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2995/48828 [04:36<1:10:06, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 2997/48828 [04:37<1:10:34, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3001/48828 [04:37<1:12:08, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 49.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6144000 | Self Similarity: 0.09
Sparsity: 49.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6144000 | Self Similarity: 0.08
Sparsity: 49.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6144000 | Self Similarity: 0.08
Sparsity: 49.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6144000 | Self Similarity: 0.08
Sparsity: 50.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6144000 | Self Similarity: 0.08
Sparsity: 49.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  6%|▌         | 3003/48828 [04:37<1:11:24, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3007/48828 [04:38<1:10:48, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3009/48828 [04:38<1:10:39, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3013/48828 [04:38<1:10:51, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3015/48828 [04:38<1:10:31, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3019/48828 [04:39<1:09:57, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3021/48828 [04:39<1:10:12, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3025/48828 [04:39<1:11:03, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3027/48828 [04:39<1:10:51, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3031/48828 [04:40<1:10:06, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3033/48828 [04:40<1:10:04, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3037/48828 [04:40<1:10:25, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3039/48828 [04:41<1:10:35, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3043/48828 [04:41<1:09:57, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3045/48828 [04:41<1:09:49, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3049/48828 [04:41<1:10:11, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▌         | 3051/48828 [04:42<1:10:18, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3055/48828 [04:42<1:09:47, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3057/48828 [04:42<1:09:40, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3061/48828 [04:43<1:09:51, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3063/48828 [04:43<1:09:50, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3067/48828 [04:43<1:09:32, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3069/48828 [04:43<1:09:54, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3073/48828 [04:44<1:10:03, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3075/48828 [04:44<1:10:03, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3079/48828 [04:44<1:09:35, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3081/48828 [04:44<1:09:32, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3085/48828 [04:45<1:10:03, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3087/48828 [04:45<1:10:05, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3091/48828 [04:45<1:09:45, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3093/48828 [04:45<1:09:59, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3097/48828 [04:46<1:11:48, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3099/48828 [04:46<1:11:18, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3101/48828 [04:46<1:12:26, 10.52it/s]

Sparsity: 54.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6348800 | Self Similarity: 0.09
Sparsity: 54.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6348800 | Self Similarity: 0.08
Sparsity: 54.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6348800 | Self Similarity: 0.08
Sparsity: 54.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6348800 | Self Similarity: 0.08
Sparsity: 54.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6348800 | Self Similarity: 0.08
Sparsity: 54.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6348800 | Self Similarity: 0.09
Sparsity: 54.1 | Dead Features: 0 | Tota

  6%|▋         | 3105/48828 [04:47<1:10:56, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3107/48828 [04:47<1:10:23, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3111/48828 [04:47<1:10:09, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3113/48828 [04:47<1:10:35, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3117/48828 [04:48<1:10:17, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3119/48828 [04:48<1:10:05, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3123/48828 [04:48<1:09:46, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3125/48828 [04:48<1:10:09, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3129/48828 [04:49<1:09:59, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3131/48828 [04:49<1:09:45, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3135/48828 [04:49<1:09:43, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3137/48828 [04:50<1:09:57, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3141/48828 [04:50<1:10:02, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3143/48828 [04:50<1:09:51, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3147/48828 [04:50<1:10:12, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3149/48828 [04:51<1:10:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3153/48828 [04:51<1:10:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3155/48828 [04:51<1:09:52, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3159/48828 [04:52<1:10:03, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3161/48828 [04:52<1:10:05, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3165/48828 [04:52<1:10:26, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3167/48828 [04:52<1:10:04, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3171/48828 [04:53<1:09:53, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  6%|▋         | 3173/48828 [04:53<1:09:57, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3177/48828 [04:53<1:10:45, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3179/48828 [04:53<1:10:25, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3183/48828 [04:54<1:09:49, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3185/48828 [04:54<1:09:57, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3189/48828 [04:54<1:10:25, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3191/48828 [04:54<1:10:23, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3195/48828 [04:55<1:09:54, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3197/48828 [04:55<1:09:47, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3201/48828 [04:55<1:11:51, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 51.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6553600 | Self Similarity: 0.09
Sparsity: 51.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6553600 | Self Similarity: 0.08
Sparsity: 51.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6553600 | Self Similarity: 0.09
Sparsity: 51.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6553600 | Self Similarity: 0.08
Sparsity: 51.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6553600 | Self Similarity: 0.09
Sparsity: 51.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  7%|▋         | 3203/48828 [04:56<1:11:27, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3207/48828 [04:56<1:10:40, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3209/48828 [04:56<1:10:23, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3213/48828 [04:57<1:09:54, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3215/48828 [04:57<1:10:00, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3219/48828 [04:57<1:10:05, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3221/48828 [04:57<1:09:57, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3225/48828 [04:58<1:10:06, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3227/48828 [04:58<1:10:18, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3231/48828 [04:58<1:10:11, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3233/48828 [04:58<1:09:56, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3237/48828 [04:59<1:09:36, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3239/48828 [04:59<1:09:47, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3243/48828 [04:59<1:09:55, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3245/48828 [04:59<1:09:37, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3249/48828 [05:00<1:09:34, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3251/48828 [05:00<1:09:57, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3255/48828 [05:00<1:10:10, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3257/48828 [05:01<1:09:52, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3261/48828 [05:01<1:09:40, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3263/48828 [05:01<1:09:52, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3267/48828 [05:02<1:10:11, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3269/48828 [05:02<1:10:03, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3273/48828 [05:02<1:10:06, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3275/48828 [05:02<1:09:50, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3279/48828 [05:03<1:09:57, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3281/48828 [05:03<1:09:58, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3285/48828 [05:03<1:09:25, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3287/48828 [05:03<1:09:24, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3291/48828 [05:04<1:09:51, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3293/48828 [05:04<1:10:02, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3297/48828 [05:04<1:09:46, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3299/48828 [05:04<1:09:35, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 54.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.09


  7%|▋         | 3301/48828 [05:05<1:11:07, 10.67it/s]

Sparsity: 54.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.08
Sparsity: 54.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.09
Sparsity: 54.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.08
Sparsity: 54.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.09
Sparsity: 54.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.09
Sparsity: 54.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6758400 | Self Similarity: 0.08
Sparsity: 54.2 | Dead Features: 0 | Tota

  7%|▋         | 3305/48828 [05:05<1:10:40, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3307/48828 [05:05<1:10:55, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3311/48828 [05:06<1:10:01, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3313/48828 [05:06<1:09:40, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3317/48828 [05:06<1:10:07, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3319/48828 [05:06<1:10:09, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3323/48828 [05:07<1:09:27, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3325/48828 [05:07<1:09:17, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3329/48828 [05:07<1:09:26, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3331/48828 [05:07<1:09:37, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3335/48828 [05:08<1:09:19, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3337/48828 [05:08<1:09:09, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3341/48828 [05:08<1:09:31, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3343/48828 [05:09<1:09:38, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3347/48828 [05:09<1:09:15, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3349/48828 [05:09<1:09:14, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3353/48828 [05:09<1:09:27, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3355/48828 [05:10<1:09:34, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3359/48828 [05:10<1:09:17, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3361/48828 [05:10<1:09:40, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3365/48828 [05:11<1:09:50, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3367/48828 [05:11<1:09:53, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3371/48828 [05:11<1:09:29, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3373/48828 [05:11<1:09:10, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3377/48828 [05:12<1:09:20, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3379/48828 [05:12<1:09:35, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3383/48828 [05:12<1:09:13, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3385/48828 [05:12<1:09:06, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3389/48828 [05:13<1:09:27, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3391/48828 [05:13<1:09:50, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3395/48828 [05:13<1:09:23, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3397/48828 [05:13<1:09:12, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3401/48828 [05:14<1:11:07, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 52.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6963200 | Self Similarity: 0.09
Sparsity: 52.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6963200 | Self Similarity: 0.09
Sparsity: 53.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6963200 | Self Similarity: 0.09
Sparsity: 52.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6963200 | Self Similarity: 0.08
Sparsity: 52.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 6963200 | Self Similarity: 0.09
Sparsity: 52.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  7%|▋         | 3403/48828 [05:14<1:10:48, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3407/48828 [05:14<1:09:55, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3409/48828 [05:15<1:09:29, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3413/48828 [05:15<1:09:24, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3415/48828 [05:15<1:09:32, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3419/48828 [05:15<1:09:12, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3421/48828 [05:16<1:09:00, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3425/48828 [05:16<1:09:12, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3427/48828 [05:16<1:09:36, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3431/48828 [05:17<1:09:13, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3433/48828 [05:17<1:09:05, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3437/48828 [05:17<1:09:24, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3439/48828 [05:17<1:09:38, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3443/48828 [05:18<1:09:22, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3445/48828 [05:18<1:09:10, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3449/48828 [05:18<1:09:12, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3451/48828 [05:18<1:09:35, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3455/48828 [05:19<1:09:11, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3457/48828 [05:19<1:09:04, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3461/48828 [05:19<1:09:12, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3463/48828 [05:20<1:09:31, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3467/48828 [05:20<1:09:05, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3469/48828 [05:20<1:08:57, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3473/48828 [05:20<1:09:12, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3475/48828 [05:21<1:09:22, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3479/48828 [05:21<1:09:00, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3481/48828 [05:21<1:08:54, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3485/48828 [05:22<1:09:13, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3487/48828 [05:22<1:09:28, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3491/48828 [05:22<1:08:58, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3493/48828 [05:22<1:08:51, 10.97it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3497/48828 [05:23<1:09:50, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3499/48828 [05:23<1:10:03, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 50.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.09


  7%|▋         | 3501/48828 [05:23<1:11:10, 10.61it/s]

Sparsity: 50.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.09
Sparsity: 51.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.09
Sparsity: 51.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.08
Sparsity: 51.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.09
Sparsity: 51.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.09
Sparsity: 50.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7168000 | Self Similarity: 0.09
Sparsity: 51.2 | Dead Features: 0 | Tota

  7%|▋         | 3505/48828 [05:23<1:09:48, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3507/48828 [05:24<1:09:27, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3511/48828 [05:24<1:09:35, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3513/48828 [05:24<1:09:38, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3517/48828 [05:24<1:09:26, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3519/48828 [05:25<1:09:13, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3523/48828 [05:25<1:09:31, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3525/48828 [05:25<1:09:58, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3529/48828 [05:26<1:09:34, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3531/48828 [05:26<1:09:13, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3535/48828 [05:26<1:09:25, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3537/48828 [05:26<1:09:33, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3541/48828 [05:27<1:09:08, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3543/48828 [05:27<1:08:55, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3547/48828 [05:27<1:09:20, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3549/48828 [05:27<1:09:41, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3553/48828 [05:28<1:09:11, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3555/48828 [05:28<1:08:55, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3559/48828 [05:28<1:08:57, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3561/48828 [05:29<1:09:29, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3565/48828 [05:29<1:09:11, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3567/48828 [05:29<1:09:23, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3571/48828 [05:29<1:09:32, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3573/48828 [05:30<1:09:36, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3577/48828 [05:30<1:09:23, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3579/48828 [05:30<1:09:07, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3583/48828 [05:31<1:08:49, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3585/48828 [05:31<1:09:09, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3589/48828 [05:31<1:08:57, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3591/48828 [05:31<1:08:48, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3595/48828 [05:32<1:09:11, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3597/48828 [05:32<1:09:22, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3601/48828 [05:32<1:10:30, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 46.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7372800 | Self Similarity: 0.09
Sparsity: 46.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7372800 | Self Similarity: 0.09
Sparsity: 47.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7372800 | Self Similarity: 0.09
Sparsity: 46.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7372800 | Self Similarity: 0.09
Sparsity: 46.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7372800 | Self Similarity: 0.09
Sparsity: 47.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  7%|▋         | 3603/48828 [05:32<1:09:55, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3607/48828 [05:33<1:09:09, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3609/48828 [05:33<1:09:19, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3613/48828 [05:33<1:09:14, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3615/48828 [05:33<1:09:06, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3619/48828 [05:34<1:08:54, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3621/48828 [05:34<1:09:13, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3625/48828 [05:34<1:09:45, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3627/48828 [05:35<1:09:24, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3631/48828 [05:35<1:08:58, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3633/48828 [05:35<1:09:42, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3637/48828 [05:36<1:10:27, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3639/48828 [05:36<1:10:35, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3643/48828 [05:36<1:09:27, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3645/48828 [05:36<1:09:12, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3649/48828 [05:37<1:09:14, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3651/48828 [05:37<1:09:22, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3655/48828 [05:37<1:08:54, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3657/48828 [05:37<1:08:53, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  7%|▋         | 3661/48828 [05:38<1:09:11, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3663/48828 [05:38<1:09:20, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3667/48828 [05:38<1:08:53, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3669/48828 [05:38<1:08:51, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3673/48828 [05:39<1:09:19, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3675/48828 [05:39<1:09:35, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3679/48828 [05:39<1:09:03, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3681/48828 [05:40<1:08:55, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3685/48828 [05:40<1:09:10, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3687/48828 [05:40<1:09:20, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3691/48828 [05:40<1:08:48, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3693/48828 [05:41<1:08:38, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3697/48828 [05:41<1:08:57, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3699/48828 [05:41<1:09:13, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 45.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09


  8%|▊         | 3701/48828 [05:41<1:10:35, 10.65it/s]

Sparsity: 45.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09
Sparsity: 46.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09
Sparsity: 46.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09
Sparsity: 46.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09
Sparsity: 46.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09
Sparsity: 46.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7577600 | Self Similarity: 0.09
Sparsity: 45.9 | Dead Features: 0 | Tota

  8%|▊         | 3705/48828 [05:42<1:09:27, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3707/48828 [05:42<1:09:30, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3711/48828 [05:42<1:09:26, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3713/48828 [05:43<1:09:33, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3717/48828 [05:43<1:08:55, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3719/48828 [05:43<1:08:41, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3723/48828 [05:43<1:09:00, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3725/48828 [05:44<1:09:15, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3729/48828 [05:44<1:08:54, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3731/48828 [05:44<1:08:43, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3735/48828 [05:45<1:08:52, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3737/48828 [05:45<1:09:07, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3741/48828 [05:45<1:08:41, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3743/48828 [05:45<1:08:38, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3747/48828 [05:46<1:08:59, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3749/48828 [05:46<1:09:24, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3753/48828 [05:46<1:08:58, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3755/48828 [05:46<1:08:42, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3759/48828 [05:47<1:08:59, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3761/48828 [05:47<1:09:24, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3765/48828 [05:47<1:08:50, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3767/48828 [05:47<1:08:44, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3771/48828 [05:48<1:08:57, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3773/48828 [05:48<1:09:14, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3777/48828 [05:48<1:08:49, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3779/48828 [05:49<1:08:38, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3783/48828 [05:49<1:08:50, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3785/48828 [05:49<1:09:06, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3789/48828 [05:49<1:08:49, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3791/48828 [05:50<1:08:35, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3795/48828 [05:50<1:08:58, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3797/48828 [05:50<1:09:10, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3801/48828 [05:51<1:10:24, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 47.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7782400 | Self Similarity: 0.09
Sparsity: 47.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7782400 | Self Similarity: 0.09
Sparsity: 48.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7782400 | Self Similarity: 0.09
Sparsity: 48.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7782400 | Self Similarity: 0.09
Sparsity: 47.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7782400 | Self Similarity: 0.09
Sparsity: 48.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  8%|▊         | 3803/48828 [05:51<1:09:49, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3807/48828 [05:51<1:09:04, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3809/48828 [05:51<1:09:05, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3813/48828 [05:52<1:09:02, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3815/48828 [05:52<1:08:49, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3819/48828 [05:52<1:08:33, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3821/48828 [05:52<1:08:58, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3825/48828 [05:53<1:09:03, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3827/48828 [05:53<1:08:56, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3831/48828 [05:53<1:08:34, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3833/48828 [05:54<1:08:43, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3837/48828 [05:54<1:09:14, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3839/48828 [05:54<1:09:01, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3843/48828 [05:54<1:08:40, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3845/48828 [05:55<1:08:38, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3849/48828 [05:55<1:08:53, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3851/48828 [05:55<1:08:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3855/48828 [05:56<1:08:27, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3857/48828 [05:56<1:08:33, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3861/48828 [05:56<1:08:55, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3863/48828 [05:56<1:08:45, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3867/48828 [05:57<1:08:29, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3869/48828 [05:57<1:08:40, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3873/48828 [05:57<1:08:54, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3875/48828 [05:57<1:08:38, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3879/48828 [05:58<1:08:25, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3881/48828 [05:58<1:08:36, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3885/48828 [05:58<1:08:52, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3887/48828 [05:58<1:08:37, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3891/48828 [05:59<1:08:29, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3893/48828 [05:59<1:08:35, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3897/48828 [05:59<1:08:56, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3899/48828 [06:00<1:08:43, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 48.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09


  8%|▊         | 3901/48828 [06:00<1:10:06, 10.68it/s]

Sparsity: 48.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09
Sparsity: 48.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09
Sparsity: 48.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09
Sparsity: 48.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09
Sparsity: 48.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09
Sparsity: 48.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 7987200 | Self Similarity: 0.09
Sparsity: 48.4 | Dead Features: 0 | Tota

  8%|▊         | 3905/48828 [06:00<1:09:13, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3907/48828 [06:00<1:09:14, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3911/48828 [06:01<1:09:10, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3913/48828 [06:01<1:08:52, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3917/48828 [06:01<1:08:45, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3919/48828 [06:01<1:08:48, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3923/48828 [06:02<1:09:02, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3925/48828 [06:02<1:08:54, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3929/48828 [06:02<1:08:36, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3931/48828 [06:03<1:08:37, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3935/48828 [06:03<1:09:07, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3937/48828 [06:03<1:08:55, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3941/48828 [06:03<1:08:37, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3943/48828 [06:04<1:08:37, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3947/48828 [06:04<1:09:13, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3949/48828 [06:04<1:09:03, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3953/48828 [06:05<1:08:44, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3955/48828 [06:05<1:08:38, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3959/48828 [06:05<1:09:16, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3961/48828 [06:05<1:09:12, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3965/48828 [06:06<1:08:37, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3967/48828 [06:06<1:08:35, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3971/48828 [06:06<1:08:53, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3973/48828 [06:06<1:09:03, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3977/48828 [06:07<1:08:35, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3979/48828 [06:07<1:08:34, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3983/48828 [06:07<1:08:58, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3985/48828 [06:08<1:09:13, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3989/48828 [06:08<1:08:58, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3991/48828 [06:08<1:08:45, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3995/48828 [06:08<1:08:52, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 3997/48828 [06:09<1:09:11, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4001/48828 [06:09<1:10:27, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 51.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8192000 | Self Similarity: 0.09
Sparsity: 51.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8192000 | Self Similarity: 0.09
Sparsity: 52.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8192000 | Self Similarity: 0.09
Sparsity: 51.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8192000 | Self Similarity: 0.09
Sparsity: 51.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8192000 | Self Similarity: 0.09
Sparsity: 51.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  8%|▊         | 4003/48828 [06:09<1:09:45, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4007/48828 [06:10<1:09:03, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4009/48828 [06:10<1:08:56, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4013/48828 [06:10<1:09:14, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4015/48828 [06:10<1:08:56, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4019/48828 [06:11<1:08:34, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4021/48828 [06:11<1:08:36, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4025/48828 [06:11<1:09:03, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4027/48828 [06:11<1:08:56, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4031/48828 [06:12<1:08:34, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4033/48828 [06:12<1:08:32, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4037/48828 [06:12<1:09:03, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4039/48828 [06:12<1:08:58, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4043/48828 [06:13<1:08:33, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4045/48828 [06:13<1:08:29, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4049/48828 [06:13<1:08:55, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4051/48828 [06:14<1:09:18, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4055/48828 [06:14<1:08:49, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4057/48828 [06:14<1:08:38, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4061/48828 [06:15<1:08:53, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4063/48828 [06:15<1:09:06, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4067/48828 [06:15<1:08:50, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4069/48828 [06:15<1:08:39, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4073/48828 [06:16<1:08:30, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4075/48828 [06:16<1:08:52, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4079/48828 [06:16<1:08:36, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4081/48828 [06:16<1:08:31, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4085/48828 [06:17<1:08:23, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4087/48828 [06:17<1:08:52, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4091/48828 [06:17<1:08:41, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4093/48828 [06:17<1:08:47, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4097/48828 [06:18<1:08:32, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4099/48828 [06:18<1:08:46, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4101/48828 [06:18<1:10:40, 10.55it/s]

Sparsity: 50.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8396800 | Self Similarity: 0.10
Sparsity: 50.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8396800 | Self Similarity: 0.09
Sparsity: 50.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8396800 | Self Similarity: 0.09
Sparsity: 50.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8396800 | Self Similarity: 0.09
Sparsity: 51.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8396800 | Self Similarity: 0.09
Sparsity: 50.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8396800 | Self Similarity: 0.09
Sparsity: 51.0 | Dead Features: 0 | Tota

  8%|▊         | 4105/48828 [06:19<1:09:59, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4107/48828 [06:19<1:09:24, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4111/48828 [06:19<1:08:43, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4113/48828 [06:19<1:08:42, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4117/48828 [06:20<1:09:10, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4119/48828 [06:20<1:08:56, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4123/48828 [06:20<1:08:32, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4125/48828 [06:20<1:08:58, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4129/48828 [06:21<1:09:08, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4131/48828 [06:21<1:09:07, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4135/48828 [06:21<1:08:35, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4137/48828 [06:22<1:08:25, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4141/48828 [06:22<1:08:41, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4143/48828 [06:22<1:08:46, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4147/48828 [06:22<1:08:23, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  8%|▊         | 4149/48828 [06:23<1:08:16, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4153/48828 [06:23<1:08:38, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4155/48828 [06:23<1:08:45, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4159/48828 [06:24<1:08:25, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4161/48828 [06:24<1:08:17, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4165/48828 [06:24<1:08:28, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4167/48828 [06:24<1:08:44, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4171/48828 [06:25<1:08:27, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4173/48828 [06:25<1:08:21, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4177/48828 [06:25<1:08:19, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4179/48828 [06:25<1:08:41, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4183/48828 [06:26<1:08:30, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4185/48828 [06:26<1:08:26, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4189/48828 [06:26<1:08:18, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4191/48828 [06:27<1:08:33, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4195/48828 [06:27<1:08:33, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4197/48828 [06:27<1:08:25, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4201/48828 [06:27<1:09:43, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8601600 | Self Similarity: 0.10
Sparsity: 44.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8601600 | Self Similarity: 0.09
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8601600 | Self Similarity: 0.09
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8601600 | Self Similarity: 0.09
Sparsity: 44.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8601600 | Self Similarity: 0.10
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  9%|▊         | 4203/48828 [06:28<1:09:14, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4207/48828 [06:28<1:09:16, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4209/48828 [06:28<1:08:58, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4213/48828 [06:29<1:08:27, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4215/48828 [06:29<1:08:20, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4219/48828 [06:29<1:08:48, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4221/48828 [06:29<1:08:39, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4225/48828 [06:30<1:08:10, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4227/48828 [06:30<1:08:09, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4231/48828 [06:30<1:08:40, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4233/48828 [06:30<1:08:40, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4237/48828 [06:31<1:08:19, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4239/48828 [06:31<1:08:12, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4243/48828 [06:31<1:08:32, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4245/48828 [06:32<1:08:42, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4249/48828 [06:32<1:08:18, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4251/48828 [06:32<1:08:07, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4255/48828 [06:32<1:08:24, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4257/48828 [06:33<1:08:36, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4261/48828 [06:33<1:08:22, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4263/48828 [06:33<1:08:10, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4267/48828 [06:34<1:08:16, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▊         | 4269/48828 [06:34<1:08:33, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4273/48828 [06:34<1:08:10, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4275/48828 [06:34<1:08:00, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4279/48828 [06:35<1:08:09, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4281/48828 [06:35<1:08:31, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4285/48828 [06:35<1:08:14, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4287/48828 [06:35<1:08:10, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4291/48828 [06:36<1:08:04, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4293/48828 [06:36<1:08:28, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4297/48828 [06:36<1:08:49, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4299/48828 [06:36<1:08:29, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 54.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.10


  9%|▉         | 4301/48828 [06:37<1:09:56, 10.61it/s]

Sparsity: 54.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.09
Sparsity: 55.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.10
Sparsity: 54.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.09
Sparsity: 54.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.10
Sparsity: 54.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.10
Sparsity: 54.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 8806400 | Self Similarity: 0.09
Sparsity: 54.7 | Dead Features: 0 | Tota

  9%|▉         | 4305/48828 [06:37<1:08:55, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4307/48828 [06:37<1:08:49, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4311/48828 [06:38<1:08:28, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4313/48828 [06:38<1:08:15, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4317/48828 [06:38<1:07:59, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4319/48828 [06:38<1:08:14, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4323/48828 [06:39<1:08:29, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4325/48828 [06:39<1:08:23, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4329/48828 [06:39<1:08:06, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4331/48828 [06:39<1:08:09, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4335/48828 [06:40<1:08:33, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4337/48828 [06:40<1:08:21, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4341/48828 [06:40<1:07:59, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4343/48828 [06:41<1:08:00, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4347/48828 [06:41<1:08:26, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4349/48828 [06:41<1:08:14, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4353/48828 [06:41<1:07:55, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4355/48828 [06:42<1:07:51, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4359/48828 [06:42<1:08:30, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4361/48828 [06:42<1:08:23, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4365/48828 [06:43<1:07:59, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4367/48828 [06:43<1:07:56, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4371/48828 [06:43<1:08:25, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4373/48828 [06:43<1:08:22, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4377/48828 [06:44<1:08:00, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4379/48828 [06:44<1:07:55, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4383/48828 [06:44<1:08:20, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4385/48828 [06:44<1:08:47, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4389/48828 [06:45<1:08:11, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4391/48828 [06:45<1:08:04, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4395/48828 [06:45<1:08:28, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4397/48828 [06:46<1:08:46, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4401/48828 [06:46<1:09:50, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 50.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9011200 | Self Similarity: 0.10
Sparsity: 50.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9011200 | Self Similarity: 0.10
Sparsity: 50.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9011200 | Self Similarity: 0.10
Sparsity: 50.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9011200 | Self Similarity: 0.09
Sparsity: 50.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9011200 | Self Similarity: 0.10
Sparsity: 50.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  9%|▉         | 4403/48828 [06:46<1:09:09, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4407/48828 [06:46<1:08:24, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4409/48828 [06:47<1:08:29, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4413/48828 [06:47<1:08:30, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4415/48828 [06:47<1:08:09, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4419/48828 [06:48<1:07:48, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4421/48828 [06:48<1:07:51, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4425/48828 [06:48<1:08:16, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4427/48828 [06:48<1:08:03, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4431/48828 [06:49<1:07:40, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4433/48828 [06:49<1:07:49, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4437/48828 [06:49<1:08:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4439/48828 [06:49<1:08:05, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4443/48828 [06:50<1:07:44, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4445/48828 [06:50<1:07:50, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4449/48828 [06:50<1:08:21, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4451/48828 [06:50<1:08:07, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4455/48828 [06:51<1:07:45, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4457/48828 [06:51<1:07:42, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4461/48828 [06:51<1:08:21, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4463/48828 [06:52<1:08:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4467/48828 [06:52<1:07:45, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4469/48828 [06:52<1:07:46, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4473/48828 [06:53<1:08:53, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4475/48828 [06:53<1:08:56, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4479/48828 [06:53<1:08:16, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4481/48828 [06:53<1:07:58, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4485/48828 [06:54<1:07:53, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4487/48828 [06:54<1:08:02, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4491/48828 [06:54<1:07:45, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4493/48828 [06:54<1:07:36, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4497/48828 [06:55<1:07:50, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4499/48828 [06:55<1:08:09, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 49.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10


  9%|▉         | 4501/48828 [06:55<1:09:20, 10.65it/s]

Sparsity: 48.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10
Sparsity: 49.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10
Sparsity: 49.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10
Sparsity: 49.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10
Sparsity: 49.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10
Sparsity: 49.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9216000 | Self Similarity: 0.10
Sparsity: 48.8 | Dead Features: 0 | Tota

  9%|▉         | 4505/48828 [06:55<1:08:15, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4507/48828 [06:56<1:08:01, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4511/48828 [06:56<1:08:05, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4513/48828 [06:56<1:08:21, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4517/48828 [06:57<1:07:48, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4519/48828 [06:57<1:07:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4523/48828 [06:57<1:07:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4525/48828 [06:57<1:07:58, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4529/48828 [06:58<1:07:34, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4531/48828 [06:58<1:07:31, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4535/48828 [06:58<1:07:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4537/48828 [06:58<1:08:03, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4541/48828 [06:59<1:07:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4543/48828 [06:59<1:07:33, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4547/48828 [06:59<1:07:55, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4549/48828 [07:00<1:08:13, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4553/48828 [07:00<1:07:47, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4555/48828 [07:00<1:07:33, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4559/48828 [07:00<1:07:40, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4561/48828 [07:01<1:07:57, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4565/48828 [07:01<1:07:39, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4567/48828 [07:01<1:07:28, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4571/48828 [07:02<1:07:42, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4573/48828 [07:02<1:07:54, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4577/48828 [07:02<1:07:42, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4579/48828 [07:02<1:07:36, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4583/48828 [07:03<1:07:33, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4585/48828 [07:03<1:07:48, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4589/48828 [07:03<1:07:33, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4591/48828 [07:03<1:07:31, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4595/48828 [07:04<1:07:32, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4597/48828 [07:04<1:07:43, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4601/48828 [07:04<1:08:58, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 44.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9420800 | Self Similarity: 0.10
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9420800 | Self Similarity: 0.10
Sparsity: 45.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9420800 | Self Similarity: 0.10
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9420800 | Self Similarity: 0.10
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9420800 | Self Similarity: 0.10
Sparsity: 44.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

  9%|▉         | 4603/48828 [07:04<1:08:27, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4607/48828 [07:05<1:07:54, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4609/48828 [07:05<1:08:09, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4613/48828 [07:05<1:08:07, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4615/48828 [07:06<1:07:48, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4619/48828 [07:06<1:07:31, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4621/48828 [07:06<1:07:35, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4625/48828 [07:06<1:07:46, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4627/48828 [07:07<1:07:38, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4631/48828 [07:07<1:07:30, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4633/48828 [07:07<1:07:32, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


  9%|▉         | 4637/48828 [07:08<1:07:52, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4639/48828 [07:08<1:07:37, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4643/48828 [07:08<1:07:25, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4645/48828 [07:08<1:07:43, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4649/48828 [07:09<1:07:55, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4651/48828 [07:09<1:07:43, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4655/48828 [07:09<1:07:26, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4657/48828 [07:09<1:07:25, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4661/48828 [07:10<1:07:44, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4663/48828 [07:10<1:07:30, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4667/48828 [07:10<1:07:31, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4669/48828 [07:11<1:07:47, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4673/48828 [07:11<1:08:04, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4675/48828 [07:11<1:07:56, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4679/48828 [07:11<1:07:31, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4681/48828 [07:12<1:07:25, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4685/48828 [07:12<1:07:56, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4687/48828 [07:12<1:08:01, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4691/48828 [07:13<1:07:31, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4693/48828 [07:13<1:07:20, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4697/48828 [07:13<1:07:43, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4699/48828 [07:13<1:07:45, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 46.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10


 10%|▉         | 4701/48828 [07:13<1:08:58, 10.66it/s]

Sparsity: 46.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10
Sparsity: 46.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10
Sparsity: 46.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10
Sparsity: 46.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10
Sparsity: 46.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10
Sparsity: 46.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9625600 | Self Similarity: 0.10
Sparsity: 46.5 | Dead Features: 0 | Tota

 10%|▉         | 4705/48828 [07:14<1:08:00, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4707/48828 [07:14<1:07:48, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4711/48828 [07:14<1:08:12, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4713/48828 [07:15<1:08:09, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4717/48828 [07:15<1:07:37, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4719/48828 [07:15<1:07:25, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4723/48828 [07:16<1:07:44, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4725/48828 [07:16<1:07:50, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4729/48828 [07:16<1:07:27, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4731/48828 [07:16<1:07:18, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4735/48828 [07:17<1:07:22, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4737/48828 [07:17<1:07:33, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4741/48828 [07:17<1:07:14, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4743/48828 [07:17<1:07:06, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4747/48828 [07:18<1:07:23, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4749/48828 [07:18<1:07:37, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4753/48828 [07:18<1:07:57, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4755/48828 [07:18<1:07:40, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4759/48828 [07:19<1:07:39, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4761/48828 [07:19<1:07:46, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4765/48828 [07:19<1:07:30, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4767/48828 [07:20<1:07:22, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4771/48828 [07:20<1:07:21, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4773/48828 [07:20<1:07:44, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4777/48828 [07:20<1:07:25, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4779/48828 [07:21<1:07:08, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4783/48828 [07:21<1:07:08, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4785/48828 [07:21<1:07:32, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4789/48828 [07:22<1:07:18, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4791/48828 [07:22<1:07:04, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4795/48828 [07:22<1:07:11, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4797/48828 [07:22<1:07:40, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4801/48828 [07:23<1:08:44, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 47.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9830400 | Self Similarity: 0.10
Sparsity: 47.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9830400 | Self Similarity: 0.10
Sparsity: 47.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9830400 | Self Similarity: 0.10
Sparsity: 47.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9830400 | Self Similarity: 0.10
Sparsity: 47.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 9830400 | Self Similarity: 0.10
Sparsity: 48.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 |

 10%|▉         | 4803/48828 [07:23<1:08:09, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4807/48828 [07:23<1:07:20, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4809/48828 [07:23<1:07:25, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4813/48828 [07:24<1:07:40, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4815/48828 [07:24<1:07:24, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4819/48828 [07:24<1:07:08, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4821/48828 [07:25<1:07:14, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4825/48828 [07:25<1:07:20, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4827/48828 [07:25<1:07:12, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4831/48828 [07:25<1:06:59, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4833/48828 [07:26<1:07:29, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4837/48828 [07:26<1:07:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4839/48828 [07:26<1:07:38, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4843/48828 [07:27<1:07:10, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4845/48828 [07:27<1:07:11, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4849/48828 [07:27<1:07:33, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4851/48828 [07:27<1:07:21, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4855/48828 [07:28<1:07:02, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4857/48828 [07:28<1:07:07, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4861/48828 [07:28<1:07:33, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4863/48828 [07:28<1:07:21, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4867/48828 [07:29<1:06:57, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4869/48828 [07:29<1:07:12, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4873/48828 [07:29<1:07:47, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4875/48828 [07:29<1:07:31, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4879/48828 [07:30<1:07:08, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|▉         | 4881/48828 [07:30<1:07:08, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4885/48828 [07:30<1:07:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4887/48828 [07:31<1:07:28, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4891/48828 [07:31<1:07:31, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4893/48828 [07:31<1:07:27, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4897/48828 [07:32<1:07:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4899/48828 [07:32<1:07:37, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 42.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10


 10%|█         | 4901/48828 [07:32<1:08:55, 10.62it/s]

Sparsity: 42.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10
Sparsity: 42.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10
Sparsity: 41.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10
Sparsity: 42.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10
Sparsity: 42.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10
Sparsity: 42.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10035200 | Self Similarity: 0.10
Sparsity: 42.3 | Dead Features: 0 

 10%|█         | 4905/48828 [07:32<1:07:42, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4907/48828 [07:32<1:07:23, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4911/48828 [07:33<1:07:31, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4913/48828 [07:33<1:07:27, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4917/48828 [07:33<1:07:08, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4919/48828 [07:34<1:07:03, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4923/48828 [07:34<1:07:20, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4925/48828 [07:34<1:07:23, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4929/48828 [07:34<1:07:05, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4931/48828 [07:35<1:06:52, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4935/48828 [07:35<1:07:20, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4937/48828 [07:35<1:07:24, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4941/48828 [07:36<1:07:03, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4943/48828 [07:36<1:07:00, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4947/48828 [07:36<1:07:13, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4949/48828 [07:36<1:07:19, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4953/48828 [07:37<1:06:56, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4955/48828 [07:37<1:06:50, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4959/48828 [07:37<1:07:10, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4961/48828 [07:37<1:07:18, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4965/48828 [07:38<1:06:56, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4967/48828 [07:38<1:06:52, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4971/48828 [07:38<1:07:06, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4973/48828 [07:39<1:07:15, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4977/48828 [07:39<1:06:58, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4979/48828 [07:39<1:06:48, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4983/48828 [07:39<1:07:02, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4985/48828 [07:40<1:07:14, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4989/48828 [07:40<1:06:51, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4991/48828 [07:40<1:06:46, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4995/48828 [07:41<1:07:07, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 4997/48828 [07:41<1:07:13, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5001/48828 [07:41<1:08:21, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 44.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10240000 | Self Similarity: 0.10
Sparsity: 44.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10240000 | Self Similarity: 0.10
Sparsity: 44.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10240000 | Self Similarity: 0.10
Sparsity: 44.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10240000 | Self Similarity: 0.10
Sparsity: 44.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10240000 | Self Similarity: 0.10
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 10%|█         | 5003/48828 [07:41<1:07:50, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5007/48828 [07:42<1:07:37, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5009/48828 [07:42<1:07:56, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5013/48828 [07:42<1:07:39, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5015/48828 [07:42<1:07:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5019/48828 [07:43<1:07:06, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5021/48828 [07:43<1:07:32, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5025/48828 [07:43<1:07:36, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5027/48828 [07:43<1:07:20, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5031/48828 [07:44<1:07:23, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5033/48828 [07:44<1:07:22, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5037/48828 [07:44<1:07:51, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5039/48828 [07:45<1:07:36, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5043/48828 [07:45<1:07:09, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5045/48828 [07:45<1:07:06, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5049/48828 [07:46<1:07:42, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5051/48828 [07:46<1:07:39, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5055/48828 [07:46<1:07:13, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5057/48828 [07:46<1:07:02, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5061/48828 [07:47<1:07:21, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5063/48828 [07:47<1:07:39, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5067/48828 [07:47<1:07:09, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5069/48828 [07:47<1:07:02, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5073/48828 [07:48<1:07:09, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5075/48828 [07:48<1:07:31, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5079/48828 [07:48<1:07:15, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5081/48828 [07:48<1:07:06, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5085/48828 [07:49<1:07:05, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5087/48828 [07:49<1:07:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5091/48828 [07:49<1:07:13, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5093/48828 [07:50<1:07:02, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5097/48828 [07:50<1:06:51, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5099/48828 [07:50<1:07:16, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5101/48828 [07:50<1:09:05, 10.55it/s]

Sparsity: 45.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10444800 | Self Similarity: 0.10
Sparsity: 45.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10444800 | Self Similarity: 0.10
Sparsity: 45.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10444800 | Self Similarity: 0.10
Sparsity: 45.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10444800 | Self Similarity: 0.10
Sparsity: 45.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10444800 | Self Similarity: 0.10
Sparsity: 45.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10444800 | Self Similarity: 0.10
Sparsity: 45.0 | Dead Features: 0 

 10%|█         | 5105/48828 [07:51<1:08:18, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5107/48828 [07:51<1:07:48, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5111/48828 [07:51<1:07:12, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5113/48828 [07:51<1:07:16, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5117/48828 [07:52<1:07:28, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5119/48828 [07:52<1:07:14, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5123/48828 [07:52<1:07:03, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 10%|█         | 5125/48828 [07:53<1:07:04, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5129/48828 [07:53<1:07:37, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5131/48828 [07:53<1:07:32, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5135/48828 [07:53<1:06:56, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5137/48828 [07:54<1:06:49, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5141/48828 [07:54<1:07:40, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5143/48828 [07:54<1:07:47, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5147/48828 [07:55<1:07:34, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5149/48828 [07:55<1:07:13, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5153/48828 [07:55<1:07:14, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5155/48828 [07:55<1:07:34, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5159/48828 [07:56<1:07:17, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5161/48828 [07:56<1:07:07, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5165/48828 [07:56<1:06:52, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5167/48828 [07:56<1:07:14, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5171/48828 [07:57<1:07:18, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5173/48828 [07:57<1:07:08, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5177/48828 [07:57<1:06:51, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5179/48828 [07:58<1:07:10, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5183/48828 [07:58<1:07:23, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5185/48828 [07:58<1:07:07, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5189/48828 [07:58<1:06:42, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5191/48828 [07:59<1:06:42, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5195/48828 [07:59<1:07:11, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5197/48828 [07:59<1:07:01, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5201/48828 [08:00<1:08:15, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 41.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10649600 | Self Similarity: 0.11
Sparsity: 41.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10649600 | Self Similarity: 0.10
Sparsity: 41.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10649600 | Self Similarity: 0.11
Sparsity: 41.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10649600 | Self Similarity: 0.10
Sparsity: 41.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10649600 | Self Similarity: 0.11
Sparsity: 41.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 11%|█         | 5203/48828 [08:00<1:07:42, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5207/48828 [08:00<1:07:25, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5209/48828 [08:00<1:07:30, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5213/48828 [08:01<1:06:59, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5215/48828 [08:01<1:06:45, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5219/48828 [08:01<1:06:52, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5221/48828 [08:01<1:07:11, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5225/48828 [08:02<1:06:45, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5227/48828 [08:02<1:06:34, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5231/48828 [08:02<1:06:38, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5233/48828 [08:03<1:06:58, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5237/48828 [08:03<1:06:51, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5239/48828 [08:03<1:06:44, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5243/48828 [08:03<1:06:44, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5245/48828 [08:04<1:06:54, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5249/48828 [08:04<1:06:47, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5251/48828 [08:04<1:06:33, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5255/48828 [08:05<1:06:35, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5257/48828 [08:05<1:06:55, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5261/48828 [08:05<1:06:49, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5263/48828 [08:05<1:06:38, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5267/48828 [08:06<1:06:29, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5269/48828 [08:06<1:06:55, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5273/48828 [08:06<1:06:56, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5275/48828 [08:06<1:06:47, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5279/48828 [08:07<1:06:34, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5281/48828 [08:07<1:06:51, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5285/48828 [08:07<1:06:58, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5287/48828 [08:07<1:06:46, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5291/48828 [08:08<1:06:28, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5293/48828 [08:08<1:06:34, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5297/48828 [08:08<1:06:54, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5299/48828 [08:09<1:06:37, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5301/48828 [08:09<1:08:21, 10.61it/s]

Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10854400 | Self Similarity: 0.11
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10854400 | Self Similarity: 0.11
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10854400 | Self Similarity: 0.11
Sparsity: 44.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10854400 | Self Similarity: 0.10
Sparsity: 44.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10854400 | Self Similarity: 0.11
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 10854400 | Self Similarity: 0.11
Sparsity: 44.7 | Dead Features: 0 

 11%|█         | 5305/48828 [08:09<1:07:16, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5307/48828 [08:09<1:07:28, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5311/48828 [08:10<1:07:21, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5313/48828 [08:10<1:07:07, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5317/48828 [08:10<1:06:35, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5319/48828 [08:10<1:06:37, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5323/48828 [08:11<1:07:05, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5325/48828 [08:11<1:06:51, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5329/48828 [08:11<1:06:28, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5331/48828 [08:12<1:06:28, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5335/48828 [08:12<1:06:55, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5337/48828 [08:12<1:06:51, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5341/48828 [08:12<1:06:31, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5343/48828 [08:13<1:06:32, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5347/48828 [08:13<1:06:56, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5349/48828 [08:13<1:07:13, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5353/48828 [08:14<1:07:09, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5355/48828 [08:14<1:06:53, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5359/48828 [08:14<1:07:22, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5361/48828 [08:14<1:08:15, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5365/48828 [08:15<1:08:28, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5367/48828 [08:15<1:08:11, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5371/48828 [08:15<1:07:58, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5373/48828 [08:15<1:07:53, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5377/48828 [08:16<1:08:39, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5379/48828 [08:16<1:09:02, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5383/48828 [08:16<1:09:01, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5385/48828 [08:17<1:08:34, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5389/48828 [08:17<1:08:15, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5391/48828 [08:17<1:07:57, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5395/48828 [08:18<1:08:03, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5397/48828 [08:18<1:08:28, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5401/48828 [08:18<1:09:47, 10.37it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 46.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11059200 | Self Similarity: 0.11
Sparsity: 47.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11059200 | Self Similarity: 0.11
Sparsity: 47.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11059200 | Self Similarity: 0.11
Sparsity: 46.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11059200 | Self Similarity: 0.11
Sparsity: 46.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11059200 | Self Similarity: 0.11
Sparsity: 47.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 11%|█         | 5403/48828 [08:18<1:09:21, 10.44it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5407/48828 [08:19<1:08:18, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5409/48828 [08:19<1:08:06, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5413/48828 [08:19<1:07:53, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5415/48828 [08:19<1:08:17, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5419/48828 [08:20<1:08:36, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5421/48828 [08:20<1:08:25, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5425/48828 [08:20<1:08:07, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5427/48828 [08:21<1:07:52, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5431/48828 [08:21<1:08:04, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5433/48828 [08:21<1:08:33, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5437/48828 [08:22<1:09:06, 10.46it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5439/48828 [08:22<1:08:54, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5443/48828 [08:22<1:08:08, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5445/48828 [08:22<1:07:42, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5449/48828 [08:23<1:07:36, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5451/48828 [08:23<1:07:41, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5455/48828 [08:23<1:07:33, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5457/48828 [08:23<1:07:23, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5461/48828 [08:24<1:07:49, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5463/48828 [08:24<1:07:37, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5467/48828 [08:24<1:07:27, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5469/48828 [08:25<1:07:53, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5473/48828 [08:25<1:07:22, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5475/48828 [08:25<1:07:06, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5479/48828 [08:25<1:06:47, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5481/48828 [08:26<1:07:05, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5485/48828 [08:26<1:06:52, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5487/48828 [08:26<1:06:39, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5491/48828 [08:27<1:06:30, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█         | 5493/48828 [08:27<1:06:50, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5497/48828 [08:27<1:07:13, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5499/48828 [08:27<1:06:51, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 41.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11


 11%|█▏        | 5501/48828 [08:27<1:08:10, 10.59it/s]

Sparsity: 42.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11
Sparsity: 42.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11
Sparsity: 42.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11
Sparsity: 42.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11
Sparsity: 42.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11
Sparsity: 42.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11264000 | Self Similarity: 0.11
Sparsity: 42.0 | Dead Features: 0 

 11%|█▏        | 5505/48828 [08:28<1:07:33, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5507/48828 [08:28<1:07:14, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5511/48828 [08:28<1:07:36, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5513/48828 [08:29<1:07:22, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5517/48828 [08:29<1:06:43, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5519/48828 [08:29<1:06:33, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5523/48828 [08:30<1:06:45, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5525/48828 [08:30<1:06:52, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5529/48828 [08:30<1:06:29, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5531/48828 [08:30<1:06:19, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5535/48828 [08:31<1:06:19, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5537/48828 [08:31<1:06:43, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5541/48828 [08:31<1:06:25, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5543/48828 [08:31<1:06:15, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5547/48828 [08:32<1:06:23, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5549/48828 [08:32<1:06:56, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5553/48828 [08:32<1:06:46, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5555/48828 [08:32<1:06:36, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5559/48828 [08:33<1:06:26, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5561/48828 [08:33<1:06:48, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5565/48828 [08:33<1:06:51, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5567/48828 [08:34<1:06:36, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5571/48828 [08:34<1:06:17, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5573/48828 [08:34<1:06:31, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5577/48828 [08:35<1:07:03, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5579/48828 [08:35<1:06:57, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5583/48828 [08:35<1:06:58, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5585/48828 [08:35<1:06:48, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5589/48828 [08:36<1:07:14, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5591/48828 [08:36<1:07:28, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5595/48828 [08:36<1:06:52, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5597/48828 [08:36<1:06:35, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5601/48828 [08:37<1:07:47, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 46.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11468800 | Self Similarity: 0.11
Sparsity: 46.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11468800 | Self Similarity: 0.11
Sparsity: 46.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11468800 | Self Similarity: 0.11
Sparsity: 45.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11468800 | Self Similarity: 0.11
Sparsity: 46.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11468800 | Self Similarity: 0.11
Sparsity: 46.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 11%|█▏        | 5603/48828 [08:37<1:07:29, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5607/48828 [08:37<1:07:25, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5609/48828 [08:37<1:06:59, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5613/48828 [08:38<1:06:27, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 11%|█▏        | 5615/48828 [08:38<1:06:17, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5619/48828 [08:38<1:06:44, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5621/48828 [08:39<1:06:38, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5625/48828 [08:39<1:06:22, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5627/48828 [08:39<1:06:17, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5631/48828 [08:40<1:06:48, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5633/48828 [08:40<1:07:06, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5637/48828 [08:40<1:06:34, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5639/48828 [08:40<1:06:22, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5643/48828 [08:41<1:06:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5645/48828 [08:41<1:06:55, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5649/48828 [08:41<1:06:43, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5651/48828 [08:41<1:06:34, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5655/48828 [08:42<1:06:38, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5657/48828 [08:42<1:06:45, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5661/48828 [08:42<1:07:04, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5663/48828 [08:42<1:06:53, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5667/48828 [08:43<1:06:25, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5669/48828 [08:43<1:06:22, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5673/48828 [08:43<1:06:54, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5675/48828 [08:44<1:06:51, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5679/48828 [08:44<1:06:23, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5681/48828 [08:44<1:06:45, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5685/48828 [08:45<1:06:46, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5687/48828 [08:45<1:07:09, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5691/48828 [08:45<1:06:40, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5693/48828 [08:45<1:06:32, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5697/48828 [08:46<1:06:13, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5699/48828 [08:46<1:06:36, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5701/48828 [08:46<1:08:16, 10.53it/s]

Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11673600 | Self Similarity: 0.11
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11673600 | Self Similarity: 0.11
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11673600 | Self Similarity: 0.11
Sparsity: 44.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11673600 | Self Similarity: 0.11
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11673600 | Self Similarity: 0.11
Sparsity: 45.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11673600 | Self Similarity: 0.11
Sparsity: 44.8 | Dead Features: 0 

 12%|█▏        | 5705/48828 [08:46<1:07:30, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5707/48828 [08:47<1:06:57, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5711/48828 [08:47<1:06:18, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5713/48828 [08:47<1:06:08, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5717/48828 [08:48<1:06:36, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5719/48828 [08:48<1:06:26, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5723/48828 [08:48<1:06:14, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5725/48828 [08:48<1:06:11, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5729/48828 [08:49<1:06:32, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5731/48828 [08:49<1:06:48, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5735/48828 [08:49<1:06:19, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5737/48828 [08:49<1:06:08, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5741/48828 [08:50<1:06:53, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5743/48828 [08:50<1:07:10, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5747/48828 [08:50<1:07:03, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5749/48828 [08:50<1:06:40, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5753/48828 [08:51<1:06:26, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5755/48828 [08:51<1:06:39, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5759/48828 [08:51<1:06:31, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5761/48828 [08:52<1:06:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5765/48828 [08:52<1:06:25, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5767/48828 [08:52<1:06:22, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5771/48828 [08:53<1:06:38, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5773/48828 [08:53<1:06:27, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5777/48828 [08:53<1:06:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5779/48828 [08:53<1:06:18, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5783/48828 [08:54<1:07:09, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5785/48828 [08:54<1:07:40, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5789/48828 [08:54<1:06:46, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5791/48828 [08:54<1:06:26, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5795/48828 [08:55<1:06:11, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5797/48828 [08:55<1:06:19, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5801/48828 [08:55<1:07:41, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 48.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11878400 | Self Similarity: 0.11
Sparsity: 48.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11878400 | Self Similarity: 0.11
Sparsity: 48.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11878400 | Self Similarity: 0.11
Sparsity: 47.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11878400 | Self Similarity: 0.11
Sparsity: 48.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 11878400 | Self Similarity: 0.11
Sparsity: 48.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 12%|█▏        | 5803/48828 [08:55<1:07:03, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5807/48828 [08:56<1:06:24, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5809/48828 [08:56<1:06:15, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5813/48828 [08:56<1:06:46, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5815/48828 [08:57<1:06:34, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5819/48828 [08:57<1:06:06, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5821/48828 [08:57<1:06:00, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5825/48828 [08:58<1:06:13, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5827/48828 [08:58<1:06:16, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5831/48828 [08:58<1:05:51, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5833/48828 [08:58<1:05:45, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5837/48828 [08:59<1:06:05, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5839/48828 [08:59<1:06:20, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5843/48828 [08:59<1:05:53, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5845/48828 [08:59<1:05:46, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5849/48828 [09:00<1:05:54, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5851/48828 [09:00<1:06:09, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5855/48828 [09:00<1:05:45, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5857/48828 [09:00<1:05:40, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5861/48828 [09:01<1:05:53, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5863/48828 [09:01<1:06:09, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5867/48828 [09:01<1:05:57, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5869/48828 [09:02<1:06:32, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5873/48828 [09:02<1:06:04, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5875/48828 [09:02<1:06:14, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5879/48828 [09:02<1:06:16, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5881/48828 [09:03<1:05:57, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5885/48828 [09:03<1:05:45, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5887/48828 [09:03<1:06:00, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5891/48828 [09:04<1:06:37, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5893/48828 [09:04<1:06:29, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5897/48828 [09:04<1:06:15, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5899/48828 [09:04<1:06:07, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5901/48828 [09:05<1:07:44, 10.56it/s]

Sparsity: 42.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12083200 | Self Similarity: 0.11
Sparsity: 42.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12083200 | Self Similarity: 0.11
Sparsity: 42.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12083200 | Self Similarity: 0.11
Sparsity: 42.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12083200 | Self Similarity: 0.11
Sparsity: 42.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12083200 | Self Similarity: 0.11
Sparsity: 42.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12083200 | Self Similarity: 0.11
Sparsity: 42.0 | Dead Features: 0 

 12%|█▏        | 5905/48828 [09:05<1:07:07, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5907/48828 [09:05<1:06:56, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5911/48828 [09:05<1:06:20, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5913/48828 [09:06<1:06:09, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5917/48828 [09:06<1:06:44, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5919/48828 [09:06<1:06:41, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5923/48828 [09:07<1:06:08, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5925/48828 [09:07<1:05:57, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5929/48828 [09:07<1:05:46, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5931/48828 [09:07<1:06:01, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5935/48828 [09:08<1:06:06, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5937/48828 [09:08<1:05:57, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5941/48828 [09:08<1:05:53, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5943/48828 [09:08<1:06:05, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5947/48828 [09:09<1:06:11, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5949/48828 [09:09<1:05:55, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5953/48828 [09:09<1:05:34, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5955/48828 [09:10<1:05:36, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5959/48828 [09:10<1:06:12, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5961/48828 [09:10<1:05:59, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5965/48828 [09:10<1:05:36, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5967/48828 [09:11<1:05:33, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5971/48828 [09:11<1:06:08, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5973/48828 [09:11<1:06:01, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5977/48828 [09:12<1:06:17, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5979/48828 [09:12<1:07:03, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5983/48828 [09:12<1:07:47, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5985/48828 [09:12<1:08:08, 10.48it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5989/48828 [09:13<1:09:04, 10.34it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5991/48828 [09:13<1:09:31, 10.27it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5995/48828 [09:13<1:10:05, 10.18it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5997/48828 [09:14<1:10:34, 10.11it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 5999/48828 [09:14<1:11:00, 10.05it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6001/48828 [09:14<1:13:30,  9.71it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 45.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12288000 | Self Similarity: 0.12
Sparsity: 45.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12288000 | Self Similarity: 0.11
Sparsity: 45.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12288000 | Self Similarity: 0.11
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12288000 | Self Similarity: 0.11
Sparsity: 45.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12288000 | Self Similarity: 0.12
Sparsity: 45.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12288000 | Self 

 12%|█▏        | 6003/48828 [09:14<1:12:11,  9.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6006/48828 [09:14<1:11:42,  9.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6010/48828 [09:15<1:10:55, 10.06it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6012/48828 [09:15<1:10:46, 10.08it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6016/48828 [09:15<1:10:46, 10.08it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6018/48828 [09:16<1:10:32, 10.11it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6022/48828 [09:16<1:10:07, 10.17it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6024/48828 [09:16<1:09:28, 10.27it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6028/48828 [09:17<1:07:38, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6030/48828 [09:17<1:07:08, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6034/48828 [09:17<1:06:55, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6036/48828 [09:17<1:07:23, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6040/48828 [09:18<1:07:07, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6042/48828 [09:18<1:06:42, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6046/48828 [09:18<1:06:01, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6048/48828 [09:18<1:06:04, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6052/48828 [09:19<1:06:31, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6054/48828 [09:19<1:06:30, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6058/48828 [09:19<1:06:21, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6060/48828 [09:20<1:06:04, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6064/48828 [09:20<1:06:29, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6066/48828 [09:20<1:06:51, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6070/48828 [09:20<1:06:47, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6072/48828 [09:21<1:06:24, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6076/48828 [09:21<1:06:14, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6078/48828 [09:21<1:06:12, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6082/48828 [09:22<1:07:14, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6084/48828 [09:22<1:07:29, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6088/48828 [09:22<1:06:42, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6090/48828 [09:22<1:06:48, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6094/48828 [09:23<1:06:06, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6096/48828 [09:23<1:06:15, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 12%|█▏        | 6100/48828 [09:23<1:06:18, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 44.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.12

 12%|█▏        | 6102/48828 [09:23<1:07:26, 10.56it/s]


Sparsity: 44.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.11
Sparsity: 45.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.12
Sparsity: 44.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.11
Sparsity: 44.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.12
Sparsity: 45.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.11
Sparsity: 44.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12492800 | Self Similarity: 0.11
Sparsity: 44.8 | Dead Features: 0

 13%|█▎        | 6104/48828 [09:24<1:06:51, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6108/48828 [09:24<1:05:58, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6110/48828 [09:24<1:05:48, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6114/48828 [09:25<1:05:57, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6116/48828 [09:25<1:05:35, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6120/48828 [09:25<1:05:08, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6122/48828 [09:25<1:05:15, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6126/48828 [09:26<1:05:57, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6128/48828 [09:26<1:05:47, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6132/48828 [09:26<1:05:10, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6134/48828 [09:26<1:05:07, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6138/48828 [09:27<1:05:51, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6140/48828 [09:27<1:05:45, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6144/48828 [09:27<1:05:12, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6146/48828 [09:28<1:05:02, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6150/48828 [09:28<1:05:39, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6152/48828 [09:28<1:05:37, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6156/48828 [09:28<1:05:13, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6158/48828 [09:29<1:05:03, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6162/48828 [09:29<1:05:39, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6164/48828 [09:29<1:05:44, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6168/48828 [09:30<1:05:09, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6170/48828 [09:30<1:05:03, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6174/48828 [09:30<1:05:33, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6176/48828 [09:30<1:05:40, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6180/48828 [09:31<1:05:16, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6182/48828 [09:31<1:05:05, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6186/48828 [09:31<1:05:34, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6188/48828 [09:31<1:06:07, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6192/48828 [09:32<1:05:45, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6194/48828 [09:32<1:05:21, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6198/48828 [09:32<1:05:45, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6200/48828 [09:33<1:06:07, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 43.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12697600 | Self Similarity: 0.12
Sparsity: 42.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12697600 | Self Similarity: 0.11
Sparsity: 42.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12697600 | Self Similarity: 0.12
Sparsity: 42.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12697600 | Self Similarity: 0.11
Sparsity: 43.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12697600 | Self Similarity: 0.12
Sparsity: 43.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 13%|█▎        | 6202/48828 [09:33<1:08:05, 10.43it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6206/48828 [09:33<1:07:42, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6208/48828 [09:33<1:07:09, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6212/48828 [09:34<1:06:05, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6214/48828 [09:34<1:05:59, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6218/48828 [09:34<1:06:14, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6220/48828 [09:34<1:06:16, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6224/48828 [09:35<1:05:36, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6226/48828 [09:35<1:05:19, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6230/48828 [09:35<1:05:27, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6232/48828 [09:36<1:06:03, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6236/48828 [09:36<1:05:39, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6238/48828 [09:36<1:05:17, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6242/48828 [09:36<1:05:19, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6244/48828 [09:37<1:05:39, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6248/48828 [09:37<1:05:31, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6250/48828 [09:37<1:05:20, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6254/48828 [09:38<1:05:10, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6256/48828 [09:38<1:05:27, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6260/48828 [09:38<1:05:52, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6262/48828 [09:38<1:05:35, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6266/48828 [09:39<1:05:02, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6268/48828 [09:39<1:05:04, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6272/48828 [09:39<1:05:33, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6274/48828 [09:39<1:05:18, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6278/48828 [09:40<1:04:58, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6280/48828 [09:40<1:04:56, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6284/48828 [09:40<1:05:33, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6286/48828 [09:40<1:05:24, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6290/48828 [09:41<1:04:51, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6292/48828 [09:41<1:04:49, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6296/48828 [09:41<1:05:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6298/48828 [09:42<1:05:19, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6302/48828 [09:42<1:06:37, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 45.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12902400 | Self Similarity: 0.12
Sparsity: 45.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12902400 | Self Similarity: 0.12
Sparsity: 45.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12902400 | Self Similarity: 0.12
Sparsity: 45.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12902400 | Self Similarity: 0.11
Sparsity: 45.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12902400 | Self Similarity: 0.12
Sparsity: 46.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 12902400 | Self 

 13%|█▎        | 6304/48828 [09:42<1:05:59, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6308/48828 [09:43<1:05:50, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6310/48828 [09:43<1:06:06, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6314/48828 [09:43<1:05:22, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6316/48828 [09:43<1:05:02, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6320/48828 [09:44<1:05:04, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6322/48828 [09:44<1:05:35, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6326/48828 [09:44<1:05:18, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6328/48828 [09:44<1:05:03, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6332/48828 [09:45<1:04:58, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6334/48828 [09:45<1:05:23, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6338/48828 [09:45<1:05:42, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6340/48828 [09:45<1:05:20, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6344/48828 [09:46<1:04:49, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6346/48828 [09:46<1:05:02, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6350/48828 [09:46<1:05:33, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6352/48828 [09:47<1:05:11, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6356/48828 [09:47<1:04:47, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6358/48828 [09:47<1:04:50, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6362/48828 [09:48<1:05:47, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6364/48828 [09:48<1:05:28, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6368/48828 [09:48<1:04:58, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6370/48828 [09:48<1:04:49, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6374/48828 [09:49<1:05:27, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6376/48828 [09:49<1:05:16, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6380/48828 [09:49<1:04:43, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6382/48828 [09:49<1:04:41, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6386/48828 [09:50<1:05:29, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6388/48828 [09:50<1:05:21, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6392/48828 [09:50<1:04:48, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6394/48828 [09:50<1:04:46, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6398/48828 [09:51<1:05:31, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6400/48828 [09:51<1:05:29, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 41.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13107200 | Self Similarity: 0.12
Sparsity: 41.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13107200 | Self Similarity: 0.12
Sparsity: 41.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13107200 | Self Similarity: 0.12
Sparsity: 41.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13107200 | Self Similarity: 0.12
Sparsity: 41.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13107200 | Self Similarity: 0.12
Sparsity: 41.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 13%|█▎        | 6404/48828 [09:51<1:06:04, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6406/48828 [09:52<1:05:27, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6410/48828 [09:52<1:05:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6412/48828 [09:52<1:05:36, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6416/48828 [09:52<1:05:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6418/48828 [09:53<1:04:51, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6422/48828 [09:53<1:04:40, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6424/48828 [09:53<1:05:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6428/48828 [09:54<1:05:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6430/48828 [09:54<1:05:22, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6434/48828 [09:54<1:04:51, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6436/48828 [09:54<1:05:09, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6440/48828 [09:55<1:05:33, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6442/48828 [09:55<1:05:16, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6446/48828 [09:55<1:04:47, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6448/48828 [09:55<1:05:04, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6452/48828 [09:56<1:06:01, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6454/48828 [09:56<1:05:42, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6458/48828 [09:56<1:06:06, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6460/48828 [09:57<1:06:19, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6464/48828 [09:57<1:06:06, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6466/48828 [09:57<1:06:08, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6470/48828 [09:58<1:06:41, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6472/48828 [09:58<1:06:04, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6476/48828 [09:58<1:05:44, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6478/48828 [09:58<1:05:21, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6482/48828 [09:59<1:05:45, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6484/48828 [09:59<1:05:48, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6488/48828 [09:59<1:05:15, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6490/48828 [09:59<1:04:57, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6494/48828 [10:00<1:05:05, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6496/48828 [10:00<1:06:11, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6500/48828 [10:00<1:05:33, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 54.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12


 13%|█▎        | 6502/48828 [10:00<1:06:54, 10.54it/s]

Sparsity: 54.3 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12
Sparsity: 54.4 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12
Sparsity: 54.1 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12
Sparsity: 54.7 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12
Sparsity: 54.6 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12
Sparsity: 53.8 | Dead Features: 0 | Total Loss: 0.03 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13312000 | Self Similarity: 0.12
Sparsity: 54.3 | Dead Features: 0 

 13%|█▎        | 6504/48828 [10:01<1:06:07, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6508/48828 [10:01<1:05:21, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6510/48828 [10:01<1:05:19, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6514/48828 [10:02<1:05:25, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6516/48828 [10:02<1:05:10, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6520/48828 [10:02<1:05:03, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6522/48828 [10:02<1:05:50, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6526/48828 [10:03<1:06:04, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6528/48828 [10:03<1:05:50, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6532/48828 [10:03<1:05:10, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6534/48828 [10:03<1:05:04, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6538/48828 [10:04<1:06:12, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6540/48828 [10:04<1:06:23, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6544/48828 [10:04<1:05:58, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6546/48828 [10:05<1:05:35, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6550/48828 [10:05<1:05:54, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6552/48828 [10:05<1:05:43, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6556/48828 [10:06<1:06:18, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6558/48828 [10:06<1:06:13, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6562/48828 [10:06<1:05:32, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6564/48828 [10:06<1:05:17, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6568/48828 [10:07<1:05:08, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6570/48828 [10:07<1:05:33, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6574/48828 [10:07<1:05:38, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6576/48828 [10:07<1:05:23, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6580/48828 [10:08<1:05:12, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6582/48828 [10:08<1:05:23, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6586/48828 [10:08<1:05:51, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 13%|█▎        | 6588/48828 [10:08<1:05:35, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6592/48828 [10:09<1:05:22, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6594/48828 [10:09<1:05:11, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6598/48828 [10:09<1:05:35, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6600/48828 [10:10<1:05:57, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 41.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13516800 | Self Similarity: 0.12
Sparsity: 41.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13516800 | Self Similarity: 0.12
Sparsity: 41.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13516800 | Self Similarity: 0.12
Sparsity: 41.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13516800 | Self Similarity: 0.11
Sparsity: 41.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13516800 | Self Similarity: 0.12
Sparsity: 41.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 14%|█▎        | 6604/48828 [10:10<1:06:45, 10.54it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6606/48828 [10:10<1:06:09, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6610/48828 [10:11<1:05:24, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6612/48828 [10:11<1:05:12, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6616/48828 [10:11<1:05:46, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6618/48828 [10:11<1:05:32, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6622/48828 [10:12<1:05:02, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6624/48828 [10:12<1:05:06, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6628/48828 [10:12<1:05:49, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6630/48828 [10:12<1:05:57, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6634/48828 [10:13<1:06:09, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6636/48828 [10:13<1:06:00, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6640/48828 [10:13<1:05:40, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6642/48828 [10:14<1:05:29, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6646/48828 [10:14<1:06:06, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6648/48828 [10:14<1:05:48, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6652/48828 [10:14<1:04:55, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6654/48828 [10:15<1:04:39, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6658/48828 [10:15<1:05:19, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6660/48828 [10:15<1:06:03, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6664/48828 [10:16<1:05:39, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6666/48828 [10:16<1:05:36, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6670/48828 [10:16<1:05:24, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6672/48828 [10:16<1:05:42, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6676/48828 [10:17<1:06:14, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6678/48828 [10:17<1:06:02, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6682/48828 [10:17<1:05:32, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6684/48828 [10:17<1:05:23, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6688/48828 [10:18<1:05:44, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6690/48828 [10:18<1:06:04, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6694/48828 [10:18<1:06:00, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6696/48828 [10:19<1:05:56, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6700/48828 [10:19<1:05:35, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6702/48828 [10:19<1:06:45, 10.52it/s]

Sparsity: 37.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13721600 | Self Similarity: 0.12
Sparsity: 37.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13721600 | Self Similarity: 0.12
Sparsity: 37.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13721600 | Self Similarity: 0.12
Sparsity: 37.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13721600 | Self Similarity: 0.12
Sparsity: 38.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13721600 | Self Similarity: 0.12
Sparsity: 37.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13721600 | Self Similarity: 0.12
Sparsity: 37.5 | Dead Features: 0 

 14%|█▎        | 6704/48828 [10:19<1:06:16, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6708/48828 [10:20<1:05:48, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▎        | 6710/48828 [10:20<1:05:32, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6714/48828 [10:20<1:04:47, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6716/48828 [10:20<1:04:38, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6720/48828 [10:21<1:05:04, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6722/48828 [10:21<1:05:06, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6726/48828 [10:21<1:04:53, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6728/48828 [10:22<1:04:38, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6732/48828 [10:22<1:05:07, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6734/48828 [10:22<1:05:18, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6738/48828 [10:22<1:05:02, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6740/48828 [10:23<1:05:00, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6744/48828 [10:23<1:04:47, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6746/48828 [10:23<1:04:59, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6750/48828 [10:24<1:05:32, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6752/48828 [10:24<1:05:04, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6756/48828 [10:24<1:04:50, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6758/48828 [10:24<1:04:44, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6762/48828 [10:25<1:05:19, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6764/48828 [10:25<1:05:14, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6768/48828 [10:25<1:04:47, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6770/48828 [10:25<1:04:36, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6774/48828 [10:26<1:05:25, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6776/48828 [10:26<1:05:26, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6780/48828 [10:26<1:04:48, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6782/48828 [10:27<1:04:28, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6786/48828 [10:27<1:04:21, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6788/48828 [10:27<1:04:35, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6792/48828 [10:27<1:04:13, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6794/48828 [10:28<1:04:15, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6798/48828 [10:28<1:04:18, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6800/48828 [10:28<1:04:47, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 41.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13926400 | Self Similarity: 0.12
Sparsity: 41.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13926400 | Self Similarity: 0.12
Sparsity: 41.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13926400 | Self Similarity: 0.12
Sparsity: 41.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13926400 | Self Similarity: 0.12
Sparsity: 41.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 13926400 | Self Similarity: 0.12
Sparsity: 41.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 14%|█▍        | 6804/48828 [10:29<1:05:34, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6806/48828 [10:29<1:05:03, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6810/48828 [10:29<1:04:24, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6812/48828 [10:29<1:04:24, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6816/48828 [10:30<1:04:35, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6818/48828 [10:30<1:04:22, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6822/48828 [10:30<1:04:02, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6824/48828 [10:30<1:04:06, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6828/48828 [10:31<1:04:24, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6830/48828 [10:31<1:04:10, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6834/48828 [10:31<1:03:58, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6836/48828 [10:32<1:04:23, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6840/48828 [10:32<1:05:23, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6842/48828 [10:32<1:05:04, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6846/48828 [10:32<1:04:23, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6848/48828 [10:33<1:04:11, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6852/48828 [10:33<1:04:50, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6854/48828 [10:33<1:04:48, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6858/48828 [10:34<1:04:40, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6860/48828 [10:34<1:04:24, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6864/48828 [10:34<1:04:47, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6866/48828 [10:34<1:04:53, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6870/48828 [10:35<1:04:25, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6872/48828 [10:35<1:04:14, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6876/48828 [10:35<1:04:18, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6878/48828 [10:35<1:04:49, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6882/48828 [10:36<1:04:25, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6884/48828 [10:36<1:04:13, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6888/48828 [10:36<1:04:07, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6890/48828 [10:37<1:04:29, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6894/48828 [10:37<1:04:29, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6896/48828 [10:37<1:04:22, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6900/48828 [10:37<1:04:05, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6902/48828 [10:38<1:05:48, 10.62it/s]

Sparsity: 46.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14131200 | Self Similarity: 0.12
Sparsity: 46.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14131200 | Self Similarity: 0.12
Sparsity: 46.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14131200 | Self Similarity: 0.12
Sparsity: 46.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14131200 | Self Similarity: 0.12
Sparsity: 46.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14131200 | Self Similarity: 0.12
Sparsity: 46.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14131200 | Self Similarity: 0.12
Sparsity: 46.4 | Dead Features: 0 

 14%|█▍        | 6904/48828 [10:38<1:05:29, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6908/48828 [10:38<1:04:51, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6910/48828 [10:38<1:04:35, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6914/48828 [10:39<1:04:13, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6916/48828 [10:39<1:04:19, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6920/48828 [10:39<1:04:27, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6922/48828 [10:39<1:04:17, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6926/48828 [10:40<1:04:05, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6928/48828 [10:40<1:04:12, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6932/48828 [10:40<1:04:20, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6934/48828 [10:41<1:04:09, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6938/48828 [10:41<1:03:59, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6940/48828 [10:41<1:03:57, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6944/48828 [10:42<1:04:51, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6946/48828 [10:42<1:04:33, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6950/48828 [10:42<1:04:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6952/48828 [10:42<1:04:19, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6956/48828 [10:43<1:04:35, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6958/48828 [10:43<1:04:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6962/48828 [10:43<1:04:02, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6964/48828 [10:43<1:04:09, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6968/48828 [10:44<1:04:23, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6970/48828 [10:44<1:04:27, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6974/48828 [10:44<1:03:59, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6976/48828 [10:44<1:04:02, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6980/48828 [10:45<1:04:08, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6982/48828 [10:45<1:04:25, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6986/48828 [10:45<1:03:57, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6988/48828 [10:46<1:04:05, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6992/48828 [10:46<1:04:24, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6994/48828 [10:46<1:04:36, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 6998/48828 [10:46<1:04:03, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7000/48828 [10:47<1:03:51, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 45.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14336000 | Self Similarity: 0.12
Sparsity: 45.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14336000 | Self Similarity: 0.12
Sparsity: 45.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14336000 | Self Similarity: 0.12
Sparsity: 44.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14336000 | Self Similarity: 0.12
Sparsity: 45.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14336000 | Self Similarity: 0.12
Sparsity: 45.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 14%|█▍        | 7004/48828 [10:47<1:04:44, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7006/48828 [10:47<1:04:42, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7010/48828 [10:48<1:04:31, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7012/48828 [10:48<1:04:14, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7016/48828 [10:48<1:04:11, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7018/48828 [10:48<1:04:19, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7022/48828 [10:49<1:04:18, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7024/48828 [10:49<1:04:00, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7028/48828 [10:49<1:04:00, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7030/48828 [10:49<1:04:02, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7034/48828 [10:50<1:04:24, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7036/48828 [10:50<1:04:17, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7040/48828 [10:50<1:04:21, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7042/48828 [10:51<1:04:15, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7046/48828 [10:51<1:04:39, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7048/48828 [10:51<1:04:33, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7052/48828 [10:51<1:04:07, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7054/48828 [10:52<1:03:49, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7058/48828 [10:52<1:04:10, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7060/48828 [10:52<1:04:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7064/48828 [10:53<1:03:38, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7066/48828 [10:53<1:03:42, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7070/48828 [10:53<1:04:07, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7072/48828 [10:53<1:04:16, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7076/48828 [10:54<1:03:47, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 14%|█▍        | 7078/48828 [10:54<1:03:36, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7082/48828 [10:54<1:03:57, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7084/48828 [10:54<1:04:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7088/48828 [10:55<1:03:55, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7090/48828 [10:55<1:04:08, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7094/48828 [10:55<1:03:58, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7096/48828 [10:56<1:04:21, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7100/48828 [10:56<1:03:58, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 39.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12


 15%|█▍        | 7102/48828 [10:56<1:05:23, 10.63it/s]

Sparsity: 39.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12
Sparsity: 40.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12
Sparsity: 39.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12
Sparsity: 39.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12
Sparsity: 40.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12
Sparsity: 39.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14540800 | Self Similarity: 0.12
Sparsity: 40.1 | Dead Features: 0 

 15%|█▍        | 7104/48828 [10:56<1:04:55, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7108/48828 [10:57<1:04:20, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7110/48828 [10:57<1:05:17, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7114/48828 [10:57<1:05:40, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7116/48828 [10:57<1:05:18, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7120/48828 [10:58<1:04:58, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7122/48828 [10:58<1:04:47, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7126/48828 [10:58<1:05:04, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7128/48828 [10:59<1:05:05, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7132/48828 [10:59<1:04:33, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7134/48828 [10:59<1:04:26, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7138/48828 [10:59<1:04:37, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7140/48828 [11:00<1:04:57, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7144/48828 [11:00<1:04:48, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7146/48828 [11:00<1:04:27, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7150/48828 [11:01<1:03:59, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7152/48828 [11:01<1:04:05, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7156/48828 [11:01<1:05:23, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7158/48828 [11:01<1:05:29, 10.60it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7162/48828 [11:02<1:04:59, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7164/48828 [11:02<1:04:30, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7168/48828 [11:02<1:04:24, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7170/48828 [11:02<1:04:56, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7174/48828 [11:03<1:04:27, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7176/48828 [11:03<1:04:20, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7180/48828 [11:03<1:04:16, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7182/48828 [11:04<1:04:36, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7186/48828 [11:04<1:04:27, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7188/48828 [11:04<1:04:01, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7192/48828 [11:04<1:03:46, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7194/48828 [11:05<1:04:04, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7198/48828 [11:05<1:04:35, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7200/48828 [11:05<1:04:20, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 45.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14745600 | Self Similarity: 0.12
Sparsity: 45.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14745600 | Self Similarity: 0.12
Sparsity: 45.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14745600 | Self Similarity: 0.12
Sparsity: 45.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14745600 | Self Similarity: 0.12
Sparsity: 45.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14745600 | Self Similarity: 0.13
Sparsity: 45.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 15%|█▍        | 7204/48828 [11:06<1:05:11, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7206/48828 [11:06<1:04:46, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7210/48828 [11:06<1:04:16, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7212/48828 [11:06<1:04:23, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7216/48828 [11:07<1:03:55, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7218/48828 [11:07<1:03:45, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7222/48828 [11:07<1:03:46, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7224/48828 [11:07<1:04:08, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7228/48828 [11:08<1:04:02, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7230/48828 [11:08<1:03:52, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7234/48828 [11:08<1:03:37, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7236/48828 [11:09<1:04:04, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7240/48828 [11:09<1:04:11, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7242/48828 [11:09<1:03:52, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7246/48828 [11:09<1:03:39, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7248/48828 [11:10<1:03:45, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7252/48828 [11:10<1:04:04, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7254/48828 [11:10<1:03:54, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7258/48828 [11:11<1:03:37, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7260/48828 [11:11<1:03:34, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7264/48828 [11:11<1:04:03, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7266/48828 [11:11<1:03:48, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7270/48828 [11:12<1:03:26, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7272/48828 [11:12<1:03:29, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7276/48828 [11:12<1:04:04, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7278/48828 [11:12<1:04:03, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7282/48828 [11:13<1:03:41, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7284/48828 [11:13<1:04:01, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7288/48828 [11:13<1:04:29, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7290/48828 [11:14<1:04:32, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7294/48828 [11:14<1:04:04, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7296/48828 [11:14<1:03:50, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7300/48828 [11:14<1:03:44, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7302/48828 [11:15<1:05:47, 10.52it/s]

Sparsity: 38.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14950400 | Self Similarity: 0.12
Sparsity: 39.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14950400 | Self Similarity: 0.12
Sparsity: 39.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14950400 | Self Similarity: 0.12
Sparsity: 38.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14950400 | Self Similarity: 0.12
Sparsity: 38.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14950400 | Self Similarity: 0.13
Sparsity: 38.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 14950400 | Self Similarity: 0.12
Sparsity: 38.7 | Dead Features: 0 

 15%|█▍        | 7304/48828 [11:15<1:05:29, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7308/48828 [11:15<1:05:13, 10.61it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7310/48828 [11:15<1:04:41, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7314/48828 [11:16<1:04:17, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7316/48828 [11:16<1:04:05, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7320/48828 [11:16<1:04:17, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▍        | 7322/48828 [11:16<1:03:57, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7326/48828 [11:17<1:03:55, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7328/48828 [11:17<1:03:51, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7332/48828 [11:17<1:04:08, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7334/48828 [11:18<1:04:04, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7338/48828 [11:18<1:03:37, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7340/48828 [11:18<1:03:31, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7344/48828 [11:19<1:03:51, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7346/48828 [11:19<1:04:00, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7350/48828 [11:19<1:03:36, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7352/48828 [11:19<1:03:28, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7356/48828 [11:20<1:03:47, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7358/48828 [11:20<1:03:58, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7362/48828 [11:20<1:04:47, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7364/48828 [11:20<1:04:25, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7368/48828 [11:21<1:04:35, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7370/48828 [11:21<1:04:57, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7374/48828 [11:21<1:05:32, 10.54it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7376/48828 [11:22<1:05:22, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7380/48828 [11:22<1:04:22, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7382/48828 [11:22<1:04:29, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7386/48828 [11:22<1:04:02, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7388/48828 [11:23<1:04:52, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7392/48828 [11:23<1:06:19, 10.41it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7394/48828 [11:23<1:05:48, 10.49it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7398/48828 [11:24<1:04:36, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7400/48828 [11:24<1:04:10, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 34.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15155200 | Self Similarity: 0.13
Sparsity: 35.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15155200 | Self Similarity: 0.12
Sparsity: 35.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15155200 | Self Similarity: 0.12
Sparsity: 35.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15155200 | Self Similarity: 0.12
Sparsity: 34.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15155200 | Self Similarity: 0.13
Sparsity: 35.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 15%|█▌        | 7404/48828 [11:24<1:04:55, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7406/48828 [11:24<1:04:39, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7410/48828 [11:25<1:04:17, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7412/48828 [11:25<1:03:51, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7416/48828 [11:25<1:03:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7418/48828 [11:25<1:03:51, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7422/48828 [11:26<1:03:52, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7424/48828 [11:26<1:03:35, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7428/48828 [11:26<1:03:20, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7430/48828 [11:27<1:03:28, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7434/48828 [11:27<1:03:58, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7436/48828 [11:27<1:03:42, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7440/48828 [11:27<1:03:34, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7442/48828 [11:28<1:03:32, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7446/48828 [11:28<1:03:54, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7448/48828 [11:28<1:03:53, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7452/48828 [11:29<1:03:42, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7454/48828 [11:29<1:03:32, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7458/48828 [11:29<1:04:00, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7460/48828 [11:29<1:04:03, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7464/48828 [11:30<1:03:28, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7466/48828 [11:30<1:03:25, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7470/48828 [11:30<1:03:22, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7472/48828 [11:30<1:04:18, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7476/48828 [11:31<1:03:49, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7478/48828 [11:31<1:03:36, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7482/48828 [11:31<1:03:18, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7484/48828 [11:32<1:03:45, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7488/48828 [11:32<1:04:19, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7490/48828 [11:32<1:03:59, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7494/48828 [11:32<1:03:32, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7496/48828 [11:33<1:03:47, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7500/48828 [11:33<1:04:06, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7502/48828 [11:33<1:05:37, 10.49it/s]

Sparsity: 40.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15360000 | Self Similarity: 0.13
Sparsity: 41.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15360000 | Self Similarity: 0.12
Sparsity: 41.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15360000 | Self Similarity: 0.12
Sparsity: 40.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15360000 | Self Similarity: 0.12
Sparsity: 40.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15360000 | Self Similarity: 0.13
Sparsity: 41.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15360000 | Self Similarity: 0.12
Sparsity: 41.0 | Dead Features: 0 

 15%|█▌        | 7504/48828 [11:33<1:05:06, 10.58it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7508/48828 [11:34<1:04:22, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7510/48828 [11:34<1:03:53, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7514/48828 [11:34<1:03:47, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7516/48828 [11:35<1:04:02, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7520/48828 [11:35<1:03:49, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7522/48828 [11:35<1:03:28, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7526/48828 [11:35<1:03:56, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7528/48828 [11:36<1:04:20, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7532/48828 [11:36<1:04:44, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7534/48828 [11:36<1:04:12, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7538/48828 [11:37<1:03:29, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7540/48828 [11:37<1:03:22, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7544/48828 [11:37<1:03:42, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7546/48828 [11:37<1:03:33, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7550/48828 [11:38<1:03:10, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7552/48828 [11:38<1:03:07, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7556/48828 [11:38<1:03:42, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7558/48828 [11:38<1:03:29, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7562/48828 [11:39<1:03:06, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7564/48828 [11:39<1:03:01, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 15%|█▌        | 7568/48828 [11:39<1:03:24, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7570/48828 [11:40<1:03:22, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7574/48828 [11:40<1:03:17, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7576/48828 [11:40<1:03:09, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7580/48828 [11:40<1:03:41, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7582/48828 [11:41<1:03:48, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7586/48828 [11:41<1:03:10, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7588/48828 [11:41<1:03:02, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7592/48828 [11:42<1:03:17, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7594/48828 [11:42<1:03:30, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7598/48828 [11:42<1:03:06, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7600/48828 [11:42<1:02:52, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 42.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15564800 | Self Similarity: 0.13
Sparsity: 42.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15564800 | Self Similarity: 0.12
Sparsity: 42.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15564800 | Self Similarity: 0.12
Sparsity: 42.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15564800 | Self Similarity: 0.12
Sparsity: 42.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15564800 | Self Similarity: 0.13
Sparsity: 43.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 16%|█▌        | 7604/48828 [11:43<1:03:51, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7606/48828 [11:43<1:04:06, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7610/48828 [11:43<1:03:47, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7612/48828 [11:43<1:03:26, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7616/48828 [11:44<1:03:00, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7618/48828 [11:44<1:03:15, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7622/48828 [11:44<1:03:30, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7624/48828 [11:44<1:03:18, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7628/48828 [11:45<1:03:07, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7630/48828 [11:45<1:03:29, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7634/48828 [11:45<1:03:51, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7636/48828 [11:46<1:03:35, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7640/48828 [11:46<1:03:16, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7642/48828 [11:46<1:03:03, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7646/48828 [11:47<1:03:27, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7648/48828 [11:47<1:03:31, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7652/48828 [11:47<1:03:09, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7654/48828 [11:47<1:03:00, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7658/48828 [11:48<1:03:09, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7660/48828 [11:48<1:03:22, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7664/48828 [11:48<1:03:02, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7666/48828 [11:48<1:02:55, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7670/48828 [11:49<1:03:02, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7672/48828 [11:49<1:03:22, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7676/48828 [11:49<1:03:01, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7678/48828 [11:49<1:02:56, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7682/48828 [11:50<1:02:58, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7684/48828 [11:50<1:03:13, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7688/48828 [11:50<1:02:57, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7690/48828 [11:51<1:02:48, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7694/48828 [11:51<1:02:56, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7696/48828 [11:51<1:03:06, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7700/48828 [11:51<1:02:51, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 39.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.13


 16%|█▌        | 7702/48828 [11:52<1:04:13, 10.67it/s]

Sparsity: 38.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.12
Sparsity: 39.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.13
Sparsity: 39.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.12
Sparsity: 38.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.13
Sparsity: 39.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.12
Sparsity: 39.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15769600 | Self Similarity: 0.12
Sparsity: 39.1 | Dead Features: 0 

 16%|█▌        | 7704/48828 [11:52<1:03:43, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7708/48828 [11:52<1:03:48, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7710/48828 [11:52<1:03:53, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7714/48828 [11:53<1:03:24, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7716/48828 [11:53<1:03:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7720/48828 [11:53<1:03:03, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7722/48828 [11:54<1:03:20, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7726/48828 [11:54<1:03:31, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7728/48828 [11:54<1:03:09, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7732/48828 [11:54<1:02:46, 10.91it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7734/48828 [11:55<1:02:49, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7738/48828 [11:55<1:03:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7740/48828 [11:55<1:03:15, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7744/48828 [11:56<1:02:49, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7746/48828 [11:56<1:02:42, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7750/48828 [11:56<1:03:19, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7752/48828 [11:56<1:03:08, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7756/48828 [11:57<1:02:35, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7758/48828 [11:57<1:02:28, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7762/48828 [11:57<1:03:20, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7764/48828 [11:57<1:03:08, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7768/48828 [11:58<1:02:34, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7770/48828 [11:58<1:02:31, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7774/48828 [11:58<1:03:28, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7776/48828 [11:58<1:03:21, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7780/48828 [11:59<1:02:44, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7782/48828 [11:59<1:02:30, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7786/48828 [11:59<1:03:03, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7788/48828 [12:00<1:03:02, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7792/48828 [12:00<1:02:33, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7794/48828 [12:00<1:02:25, 10.96it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7798/48828 [12:01<1:03:06, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7800/48828 [12:01<1:03:13, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 44.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15974400 | Self Similarity: 0.13
Sparsity: 44.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15974400 | Self Similarity: 0.12
Sparsity: 44.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15974400 | Self Similarity: 0.13
Sparsity: 44.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15974400 | Self Similarity: 0.12
Sparsity: 44.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 15974400 | Self Similarity: 0.13
Sparsity: 44.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 16%|█▌        | 7804/48828 [12:01<1:03:55, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7806/48828 [12:01<1:03:15, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7810/48828 [12:02<1:02:45, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7812/48828 [12:02<1:03:06, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7816/48828 [12:02<1:02:47, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7818/48828 [12:02<1:02:28, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7822/48828 [12:03<1:02:24, 10.95it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7824/48828 [12:03<1:02:52, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7828/48828 [12:03<1:02:43, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7830/48828 [12:03<1:02:32, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7834/48828 [12:04<1:02:25, 10.94it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7836/48828 [12:04<1:02:56, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7840/48828 [12:04<1:02:48, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7842/48828 [12:05<1:02:32, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7846/48828 [12:05<1:02:30, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7848/48828 [12:05<1:02:51, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7852/48828 [12:05<1:02:40, 10.90it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7854/48828 [12:06<1:02:28, 10.93it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7858/48828 [12:06<1:02:33, 10.92it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7860/48828 [12:06<1:02:57, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7864/48828 [12:07<1:03:04, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7866/48828 [12:07<1:03:07, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7870/48828 [12:07<1:03:11, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7872/48828 [12:07<1:03:16, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7876/48828 [12:08<1:03:46, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7878/48828 [12:08<1:03:31, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7882/48828 [12:08<1:02:56, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7884/48828 [12:08<1:02:49, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7888/48828 [12:09<1:03:29, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7890/48828 [12:09<1:03:26, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7894/48828 [12:09<1:03:02, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7896/48828 [12:10<1:02:52, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7900/48828 [12:10<1:03:20, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7902/48828 [12:10<1:04:55, 10.51it/s]

Sparsity: 49.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16179200 | Self Similarity: 0.13
Sparsity: 49.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16179200 | Self Similarity: 0.13
Sparsity: 49.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16179200 | Self Similarity: 0.13
Sparsity: 48.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16179200 | Self Similarity: 0.12
Sparsity: 49.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16179200 | Self Similarity: 0.13
Sparsity: 49.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16179200 | Self Similarity: 0.12
Sparsity: 49.1 | Dead Features: 0 

 16%|█▌        | 7904/48828 [12:10<1:04:35, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7908/48828 [12:11<1:03:37, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7910/48828 [12:11<1:03:20, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7914/48828 [12:11<1:02:52, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7916/48828 [12:11<1:03:03, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7920/48828 [12:12<1:03:16, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7922/48828 [12:12<1:03:00, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7926/48828 [12:12<1:02:49, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7928/48828 [12:13<1:02:59, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7932/48828 [12:13<1:03:34, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▌        | 7934/48828 [12:13<1:03:17, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7938/48828 [12:13<1:03:16, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7940/48828 [12:14<1:03:05, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7944/48828 [12:14<1:03:35, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7946/48828 [12:14<1:03:46, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7950/48828 [12:15<1:03:10, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7952/48828 [12:15<1:02:57, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7956/48828 [12:15<1:03:13, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7958/48828 [12:15<1:03:27, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7962/48828 [12:16<1:03:32, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7964/48828 [12:16<1:03:07, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7968/48828 [12:16<1:02:48, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7970/48828 [12:16<1:03:01, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7974/48828 [12:17<1:03:38, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7976/48828 [12:17<1:03:22, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7980/48828 [12:17<1:02:53, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7982/48828 [12:18<1:02:52, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7986/48828 [12:18<1:03:15, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7988/48828 [12:18<1:03:25, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7992/48828 [12:18<1:02:53, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7994/48828 [12:19<1:02:40, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 7998/48828 [12:19<1:02:57, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8000/48828 [12:19<1:03:15, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 42.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16384000 | Self Similarity: 0.13
Sparsity: 42.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16384000 | Self Similarity: 0.13
Sparsity: 42.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16384000 | Self Similarity: 0.13
Sparsity: 42.8 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16384000 | Self Similarity: 0.12
Sparsity: 42.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16384000 | Self Similarity: 0.13
Sparsity: 42.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 16%|█▋        | 8004/48828 [12:20<1:04:15, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8006/48828 [12:20<1:03:41, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8010/48828 [12:20<1:03:00, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8012/48828 [12:20<1:02:55, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8016/48828 [12:21<1:03:25, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8018/48828 [12:21<1:03:22, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8022/48828 [12:21<1:02:47, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8024/48828 [12:21<1:02:42, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8028/48828 [12:22<1:03:20, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8030/48828 [12:22<1:03:33, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8034/48828 [12:22<1:03:02, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8036/48828 [12:23<1:02:54, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8040/48828 [12:23<1:02:40, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8042/48828 [12:23<1:03:10, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8046/48828 [12:23<1:03:08, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8048/48828 [12:24<1:03:27, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8052/48828 [12:24<1:02:52, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 16%|█▋        | 8054/48828 [12:24<1:02:47, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8058/48828 [12:25<1:03:11, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8060/48828 [12:25<1:03:00, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8064/48828 [12:25<1:02:36, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8066/48828 [12:25<1:02:39, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8070/48828 [12:26<1:03:29, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8072/48828 [12:26<1:03:35, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8076/48828 [12:26<1:03:04, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8078/48828 [12:26<1:02:51, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8082/48828 [12:27<1:02:51, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8084/48828 [12:27<1:03:01, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8088/48828 [12:27<1:02:57, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8090/48828 [12:28<1:02:44, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8094/48828 [12:28<1:02:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8096/48828 [12:28<1:02:52, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8100/48828 [12:29<1:03:04, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 38.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.13

 17%|█▋        | 8102/48828 [12:29<1:04:22, 10.54it/s]


Sparsity: 38.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.13
Sparsity: 38.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.13
Sparsity: 38.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.12
Sparsity: 38.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.13
Sparsity: 38.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.13
Sparsity: 38.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16588800 | Self Similarity: 0.13
Sparsity: 38.6 | Dead Features: 0

 17%|█▋        | 8104/48828 [12:29<1:03:48, 10.64it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8108/48828 [12:29<1:03:09, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8110/48828 [12:29<1:03:00, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8114/48828 [12:30<1:03:38, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8116/48828 [12:30<1:03:35, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8120/48828 [12:30<1:02:59, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8122/48828 [12:31<1:02:50, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8126/48828 [12:31<1:02:48, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8128/48828 [12:31<1:03:08, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8132/48828 [12:31<1:02:58, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8134/48828 [12:32<1:02:46, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8138/48828 [12:32<1:02:43, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8140/48828 [12:32<1:02:55, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8144/48828 [12:33<1:03:08, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8146/48828 [12:33<1:02:56, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8150/48828 [12:33<1:02:38, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8152/48828 [12:33<1:02:37, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8156/48828 [12:34<1:03:25, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8158/48828 [12:34<1:03:34, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8162/48828 [12:34<1:02:54, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8164/48828 [12:34<1:02:44, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8168/48828 [12:35<1:03:17, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8170/48828 [12:35<1:03:35, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8174/48828 [12:35<1:03:32, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8176/48828 [12:36<1:03:07, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8180/48828 [12:36<1:02:49, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8182/48828 [12:36<1:02:51, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8186/48828 [12:37<1:03:41, 10.63it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8188/48828 [12:37<1:03:33, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8192/48828 [12:37<1:03:05, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8194/48828 [12:37<1:02:58, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8198/48828 [12:38<1:02:47, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8200/48828 [12:38<1:03:14, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 40.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16793600 | Self Similarity: 0.13
Sparsity: 40.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16793600 | Self Similarity: 0.13
Sparsity: 40.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16793600 | Self Similarity: 0.13
Sparsity: 40.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16793600 | Self Similarity: 0.12
Sparsity: 40.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16793600 | Self Similarity: 0.13
Sparsity: 40.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 17%|█▋        | 8202/48828 [12:38<1:05:04, 10.41it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8206/48828 [12:38<1:04:02, 10.57it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8208/48828 [12:39<1:03:27, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8212/48828 [12:39<1:02:50, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8214/48828 [12:39<1:02:40, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8218/48828 [12:40<1:03:16, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8220/48828 [12:40<1:03:04, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8224/48828 [12:40<1:02:39, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8226/48828 [12:40<1:02:32, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8230/48828 [12:41<1:03:11, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8232/48828 [12:41<1:03:25, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8236/48828 [12:41<1:02:55, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8238/48828 [12:41<1:02:41, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8242/48828 [12:42<1:02:35, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8244/48828 [12:42<1:02:55, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8248/48828 [12:42<1:03:03, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8250/48828 [12:42<1:02:47, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8254/48828 [12:43<1:02:30, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8256/48828 [12:43<1:02:32, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8260/48828 [12:43<1:03:08, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8262/48828 [12:44<1:03:02, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8266/48828 [12:44<1:03:03, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8268/48828 [12:44<1:02:47, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8272/48828 [12:45<1:02:57, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8274/48828 [12:45<1:03:16, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8278/48828 [12:45<1:02:55, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8280/48828 [12:45<1:02:47, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8284/48828 [12:46<1:02:38, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8286/48828 [12:46<1:02:50, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8290/48828 [12:46<1:03:21, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8292/48828 [12:46<1:03:09, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8296/48828 [12:47<1:02:38, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8298/48828 [12:47<1:02:32, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8302/48828 [12:47<1:04:10, 10.53it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 41.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16998400 | Self Similarity: 0.13
Sparsity: 41.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16998400 | Self Similarity: 0.13
Sparsity: 41.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16998400 | Self Similarity: 0.13
Sparsity: 42.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16998400 | Self Similarity: 0.12
Sparsity: 41.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16998400 | Self Similarity: 0.13
Sparsity: 41.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 16998400 | Self 

 17%|█▋        | 8304/48828 [12:48<1:04:14, 10.51it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8306/48828 [12:48<1:03:58, 10.56it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8310/48828 [12:48<1:03:04, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8312/48828 [12:48<1:02:49, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8316/48828 [12:49<1:02:37, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8318/48828 [12:49<1:02:56, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8322/48828 [12:49<1:02:53, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8324/48828 [12:49<1:02:36, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8328/48828 [12:50<1:02:23, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8330/48828 [12:50<1:02:33, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8334/48828 [12:50<1:02:55, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8336/48828 [12:50<1:02:50, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8340/48828 [12:51<1:02:18, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8342/48828 [12:51<1:02:13, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8346/48828 [12:51<1:02:44, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8348/48828 [12:52<1:02:52, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8352/48828 [12:52<1:02:21, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8354/48828 [12:52<1:02:14, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8358/48828 [12:53<1:02:29, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8360/48828 [12:53<1:02:48, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8364/48828 [12:53<1:02:35, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8366/48828 [12:53<1:02:27, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8370/48828 [12:54<1:02:31, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8372/48828 [12:54<1:02:41, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8376/48828 [12:54<1:02:56, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8378/48828 [12:54<1:02:46, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8382/48828 [12:55<1:02:18, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8384/48828 [12:55<1:02:19, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8388/48828 [12:55<1:02:46, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8390/48828 [12:56<1:03:00, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8394/48828 [12:56<1:02:56, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8396/48828 [12:56<1:02:34, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8400/48828 [12:56<1:02:21, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8402/48828 [12:57<1:04:04, 10.52it/s]

Sparsity: 43.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17203200 | Self Similarity: 0.13
Sparsity: 43.0 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17203200 | Self Similarity: 0.13
Sparsity: 43.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17203200 | Self Similarity: 0.13
Sparsity: 42.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17203200 | Self Similarity: 0.13
Sparsity: 43.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17203200 | Self Similarity: 0.13
Sparsity: 43.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17203200 | Self Similarity: 0.13
Sparsity: 43.1 | Dead Features: 0 

 17%|█▋        | 8404/48828 [12:57<1:03:54, 10.54it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8408/48828 [12:57<1:03:37, 10.59it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8410/48828 [12:57<1:03:07, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8414/48828 [12:58<1:02:12, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8416/48828 [12:58<1:02:13, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8420/48828 [12:58<1:02:50, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8422/48828 [12:58<1:02:40, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8426/48828 [12:59<1:02:03, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8428/48828 [12:59<1:01:54, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8432/48828 [12:59<1:02:28, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8434/48828 [13:00<1:02:34, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8438/48828 [13:00<1:02:12, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8440/48828 [13:00<1:01:59, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8444/48828 [13:01<1:02:21, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8446/48828 [13:01<1:02:45, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8450/48828 [13:01<1:02:28, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8452/48828 [13:01<1:02:12, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8456/48828 [13:02<1:01:59, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8458/48828 [13:02<1:02:16, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8462/48828 [13:02<1:02:36, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8464/48828 [13:02<1:02:18, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8468/48828 [13:03<1:01:50, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8470/48828 [13:03<1:01:59, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8474/48828 [13:03<1:02:32, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8476/48828 [13:03<1:02:31, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8480/48828 [13:04<1:02:00, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8482/48828 [13:04<1:01:47, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8486/48828 [13:04<1:02:37, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8488/48828 [13:05<1:02:49, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8492/48828 [13:05<1:02:36, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8494/48828 [13:05<1:02:34, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8498/48828 [13:06<1:02:15, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8500/48828 [13:06<1:02:26, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 40.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17408000 | Self Similarity: 0.13
Sparsity: 39.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17408000 | Self Similarity: 0.13
Sparsity: 39.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17408000 | Self Similarity: 0.13
Sparsity: 39.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17408000 | Self Similarity: 0.13
Sparsity: 39.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17408000 | Self Similarity: 0.13
Sparsity: 39.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 17%|█▋        | 8502/48828 [13:06<1:03:56, 10.51it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8506/48828 [13:06<1:03:17, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8508/48828 [13:06<1:03:06, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8512/48828 [13:07<1:02:20, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8514/48828 [13:07<1:02:10, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8518/48828 [13:07<1:02:29, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8520/48828 [13:08<1:02:20, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8524/48828 [13:08<1:01:49, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8526/48828 [13:08<1:01:42, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8530/48828 [13:09<1:02:28, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8532/48828 [13:09<1:02:29, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8536/48828 [13:09<1:01:57, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8538/48828 [13:09<1:01:46, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8542/48828 [13:10<1:02:00, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 17%|█▋        | 8544/48828 [13:10<1:02:21, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8548/48828 [13:10<1:01:49, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8550/48828 [13:10<1:01:39, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8554/48828 [13:11<1:01:48, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8556/48828 [13:11<1:02:15, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8560/48828 [13:11<1:02:03, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8562/48828 [13:11<1:01:48, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8566/48828 [13:12<1:01:46, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8568/48828 [13:12<1:02:04, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8572/48828 [13:12<1:02:16, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8574/48828 [13:13<1:02:06, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8578/48828 [13:13<1:01:53, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8580/48828 [13:13<1:02:01, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8584/48828 [13:14<1:02:13, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8586/48828 [13:14<1:02:05, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8590/48828 [13:14<1:02:14, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8592/48828 [13:14<1:02:29, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8596/48828 [13:15<1:02:29, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8598/48828 [13:15<1:03:08, 10.62it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8602/48828 [13:15<1:03:33, 10.55it/s]

representation shape is: torch.Size([8, 256, 512])
Sparsity: 40.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17612800 | Self Similarity: 0.13
Sparsity: 40.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17612800 | Self Similarity: 0.13
Sparsity: 40.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17612800 | Self Similarity: 0.13
Sparsity: 40.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17612800 | Self Similarity: 0.13
Sparsity: 40.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17612800 | Self Similarity: 0.13
Sparsity: 40.9 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17612800 | Self 

 18%|█▊        | 8604/48828 [13:15<1:02:54, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8608/48828 [13:16<1:02:07, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8610/48828 [13:16<1:02:13, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8614/48828 [13:16<1:02:17, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8616/48828 [13:16<1:02:00, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8620/48828 [13:17<1:01:41, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8622/48828 [13:17<1:01:39, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8626/48828 [13:17<1:02:28, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8628/48828 [13:18<1:02:15, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8632/48828 [13:18<1:02:01, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8634/48828 [13:18<1:01:45, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8638/48828 [13:19<1:01:58, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8640/48828 [13:19<1:02:07, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8644/48828 [13:19<1:01:47, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8646/48828 [13:19<1:01:40, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8650/48828 [13:20<1:02:10, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8652/48828 [13:20<1:02:32, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8656/48828 [13:20<1:02:17, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8658/48828 [13:20<1:02:04, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8662/48828 [13:21<1:01:58, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8664/48828 [13:21<1:02:16, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8668/48828 [13:21<1:02:32, 10.70it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8670/48828 [13:21<1:02:17, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8674/48828 [13:22<1:01:54, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8676/48828 [13:22<1:01:44, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8680/48828 [13:22<1:02:11, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8682/48828 [13:23<1:02:19, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8686/48828 [13:23<1:02:07, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8688/48828 [13:23<1:01:55, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8692/48828 [13:24<1:02:01, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8694/48828 [13:24<1:02:06, 10.77it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8698/48828 [13:24<1:02:02, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8700/48828 [13:24<1:01:56, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 37.1 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17817600 | Self Similarity: 0.13
Sparsity: 37.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17817600 | Self Similarity: 0.13
Sparsity: 37.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17817600 | Self Similarity: 0.13
Sparsity: 37.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17817600 | Self Similarity: 0.13
Sparsity: 37.2 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 17817600 | Self Similarity: 0.13
Sparsity: 37.4 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0

 18%|█▊        | 8704/48828 [13:25<1:02:34, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8706/48828 [13:25<1:02:24, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8710/48828 [13:25<1:02:40, 10.67it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8712/48828 [13:25<1:02:32, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8716/48828 [13:26<1:01:57, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8718/48828 [13:26<1:01:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8722/48828 [13:26<1:02:25, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8724/48828 [13:27<1:02:45, 10.65it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8728/48828 [13:27<1:02:23, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8730/48828 [13:27<1:02:10, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8734/48828 [13:27<1:01:49, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8736/48828 [13:28<1:02:00, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8740/48828 [13:28<1:02:22, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8742/48828 [13:28<1:02:13, 10.74it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8746/48828 [13:29<1:01:56, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8748/48828 [13:29<1:01:54, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8752/48828 [13:29<1:02:14, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8754/48828 [13:29<1:02:31, 10.68it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8758/48828 [13:30<1:01:56, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8760/48828 [13:30<1:01:42, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8764/48828 [13:30<1:01:36, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8766/48828 [13:30<1:01:56, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8770/48828 [13:31<1:01:56, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8772/48828 [13:31<1:01:53, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8776/48828 [13:31<1:01:29, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8778/48828 [13:32<1:01:41, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8782/48828 [13:32<1:02:03, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8784/48828 [13:32<1:01:47, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8788/48828 [13:32<1:01:21, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8790/48828 [13:33<1:01:18, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8794/48828 [13:33<1:02:04, 10.75it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8796/48828 [13:33<1:01:54, 10.78it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8800/48828 [13:34<1:01:20, 10.88it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
Sparsity: 38.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13


 18%|█▊        | 8802/48828 [13:34<1:03:03, 10.58it/s]

Sparsity: 38.5 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13
Sparsity: 38.7 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13
Sparsity: 38.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13
Sparsity: 38.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13
Sparsity: 38.6 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13
Sparsity: 38.3 | Dead Features: 0 | Total Loss: 0.02 | Reconstruction Loss: 0.01 | L1 Loss: 0.01 | l1_alpha: 8.00e-04 | Tokens: 18022400 | Self Similarity: 0.13
Sparsity: 39.0 | Dead Features: 0 

 18%|█▊        | 8804/48828 [13:34<1:02:36, 10.66it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8808/48828 [13:34<1:02:23, 10.69it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8810/48828 [13:34<1:02:12, 10.72it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8814/48828 [13:35<1:01:37, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8816/48828 [13:35<1:01:24, 10.86it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8820/48828 [13:35<1:01:46, 10.79it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8822/48828 [13:36<1:01:56, 10.76it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8826/48828 [13:36<1:01:36, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8828/48828 [13:36<1:01:25, 10.85it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8832/48828 [13:37<1:01:28, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8834/48828 [13:37<1:01:40, 10.81it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8838/48828 [13:37<1:01:33, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8840/48828 [13:37<1:01:19, 10.87it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8844/48828 [13:38<1:01:12, 10.89it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8846/48828 [13:38<1:01:29, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8850/48828 [13:38<1:01:42, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8852/48828 [13:38<1:01:32, 10.83it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8856/48828 [13:39<1:01:40, 10.80it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8858/48828 [13:39<1:01:34, 10.82it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8862/48828 [13:39<1:02:11, 10.71it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8864/48828 [13:39<1:02:03, 10.73it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8868/48828 [13:40<1:01:26, 10.84it/s]

representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])
representation shape is: torch.Size([8, 256, 512])


 18%|█▊        | 8870/48828 [13:40<1:01:16, 10.87it/s]

In [None]:
model_save_name = cfg.model_name.split("/")[-1]
save_name = f"{model_save_name}_sp{cfg.sparsity}_r{cfg.ratio}_{tensor_names[0]}"  # trim year

# Make directory traiend_models if it doesn't exist
import os
if not os.path.exists("trained_models"):
    os.makedirs("trained_models")
# Save model
torch.save(autoencoder, f"trained_models/{save_name}.pt")

In [None]:
wandb.finish()