In [1]:
import torch 
import argparse
from utils import dotdict
from activation_dataset import setup_token_data
import wandb
import json
from datetime import datetime
from tqdm import tqdm
from einops import rearrange
import matplotlib.pyplot as plt
# from standard_metrics import run_with_model_intervention, perplexity_under_reconstruction, mean_nonzero_activations
# Create 
# # make an argument parser directly below
# parser = argparse.ArgumentParser()
# parser.add_argument("--model_name", type=str, default="EleutherAI/pythia-70m-deduped")
# parser.add_argument("--layer", type=int, default=4)
# parser.add_argument("--setting", type=str, default="residual")
# parser.add_argument("--l1_alpha", type=float, default=3e-3)
# parser.add_argument("--num_epochs", type=int, default=10)
# parser.add_argument("--model_batch_size", type=int, default=4)
# parser.add_argument("--lr", type=float, default=1e-3)
# parser.add_argument("--kl", type=bool, default=False)
# parser.add_argument("--reconstruction", type=bool, default=False)
# parser.add_argument("--dataset_name", type=str, default="NeelNanda/pile-10k")
# parser.add_argument("--device", type=str, default="cuda:4")

# args = parser.parse_args()
# Experiment configuration, collected on one attribute-style config object.
cfg = dotdict()

# Model checkpoints.
# Candidates noted so far: "EleutherAI/pythia-70m-deduped", "usvsnsp/pythia-6.9b-sft"
cfg.model_name = "EleutherAI/pythia-70m-deduped"
cfg.target_name = "EleutherAI/pythia-70m-deduped"

# Hook point: `tensor_name` is formatted once per entry of `layers`.
cfg.layers = [4]
cfg.setting = "residual"
cfg.tensor_name = "gpt_neox.layers.{layer}"

# Sparsity penalty weight; the starting value is kept separately for reference.
original_l1_alpha = 1e-3
cfg.l1_alpha = original_l1_alpha
cfg.sparsity = None  # None -> a default target is derived from the activation size later

# Optimisation settings.
cfg.num_epochs = 10
cfg.model_batch_size = 4
cfg.lr = 1e-3
cfg.kl = False
cfg.reconstruction = False

# Data source.
# cfg.dataset_name = "NeelNanda/pile-10k"
cfg.dataset_name = "Elriggs/openwebtext-100k"

# Runtime.
cfg.device = "cuda:0"
# cfg.device = "cpu"
cfg.ratio = 4  # dictionary size as a multiple of the activation size
cfg.seed = 0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One hook-point name per configured layer, filled into the `tensor_name` template.
tensor_names = [
    cfg.tensor_name.format(layer=layer_idx)
    for layer_idx in cfg.layers
]

In [3]:
# Load the causal language model onto the configured device, plus its tokenizer.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(cfg.model_name).to(cfg.device)
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

In [4]:
# Build the token loader for the configured dataset.
# TODO iteratively grab dataset?
cfg.max_length, cfg.model_batch_size = 256, 4
token_loader = setup_token_data(cfg, tokenizer, model, seed=cfg.seed)

# Token count = batches * sequences per batch * tokens per sequence
# (this treats every batch as full).
num_tokens = len(token_loader) * cfg.model_batch_size * cfg.max_length
print(f"Number of tokens: {num_tokens}")

Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow


Number of tokens: 112749568


In [5]:
# Push a one-token prompt through the model to read off the width of the hooked layer.
from baukit import Trace

text = "1"
tokens = tokenizer(text, return_tensors="pt").input_ids.to(cfg.device)

# Trace records the output of the module named by tensor_names[0] during the forward pass.
with torch.no_grad(), Trace(model, tensor_names[0]) as ret:
    _ = model(tokens)
    representation = ret.output

# The traced output may be a tuple; the first element is the one used here.
if isinstance(representation, tuple):
    representation = representation[0]

# The last dimension of the traced tensor is the activation width.
activation_size = representation.shape[-1]
print(f"Activation size: {activation_size}")

Activation size: 512


In [6]:
# Initialize New autoencoder
from autoencoders.learned_dict import TiedSAE, UntiedSAE, AnthropicSAE
from torch import nn

# Seed torch's global RNG from the config so the Xavier initialisation below
# (and any later torch sampling) is reproducible across kernel restarts.
torch.manual_seed(cfg.seed)

# The dictionary has `cfg.ratio` times as many features as the activation width.
n_dict_components = activation_size*cfg.ratio

params = dict()
# Encoder first, then decoder, so the random draws happen in a fixed order.
for weight_name in ("encoder", "decoder"):
    params[weight_name] = torch.empty((n_dict_components, activation_size), device=cfg.device)
    nn.init.xavier_uniform_(params[weight_name])

# Both biases start at zero.
params["encoder_bias"] = torch.zeros((n_dict_components,), device=cfg.device)
params["shift_bias"] = torch.zeros((activation_size,), device=cfg.device)

autoencoder = AnthropicSAE(  # TiedSAE, UntiedSAE, AnthropicSAE
    # n_feats = n_dict_components, 
    # activation_size=activation_size,
    encoder=params["encoder"],
    encoder_bias=params["encoder_bias"],
    decoder=params["decoder"],
    shift_bias=params["shift_bias"],
)
autoencoder.to_device(cfg.device)
autoencoder.set_grad()
# autoencoder.encoder.requires_grad = True
# autoencoder.encoder_bias.requires_grad = True
# autoencoder.decoder.requires_grad = True
# autoencoder.shift_bias.requires_grad = True

# Adam optimises the four autoencoder tensors directly.
optimizer = torch.optim.Adam(
    [
        autoencoder.encoder, 
        autoencoder.encoder_bias,
        autoencoder.decoder,
        autoencoder.shift_bias,
    ], lr=cfg.lr)

In [7]:
# When the config leaves the target unset, default it to 10% of the activation width
# (truncated to an integer).
if cfg.sparsity is None:
    cfg.sparsity = int(activation_size*0.1)
    print(f"Target sparsity: {cfg.sparsity}")

# Band of +/- 5 around the target, and the relative step size. Both are only
# referenced by the commented-out running sparsity check in the training cell.
target_lower_sparsity, target_upper_sparsity = cfg.sparsity - 5.0, cfg.sparsity + 5.0
adjustment_factor = 0.1  # You can set this to whatever you like

Target sparsity: 51


In [8]:
# Detached copy of the encoder bias as it is before training.
original_bias = autoencoder.encoder_bias.clone().detach()

# Wandb setup
# Read the API key through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("secrets.json") as secrets_file:
    secrets = json.load(secrets_file)
wandb.login(key=secrets["wandb_key"])

# Run name: model name, timestamp without the year, and the target sparsity.
start_time = datetime.now().strftime("%Y%m%d-%H%M%S")
wandb_run_name = f"{cfg.model_name}_{start_time[4:]}_{cfg.sparsity}"  # trim year
print(f"wandb_run_name: {wandb_run_name}")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mbenw8888[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


wandb_run_name: EleutherAI/pythia-70m-deduped_1011-032433_51


In [14]:
# Start the wandb run, recording the whole config as a plain dict.
wandb.init(
    project="sparse coding",
    config=dict(cfg),
    name=wandb_run_name,
)

In [13]:
# import os
# class ActivationSaver():    
#     def __init__(self, batches_per_file=5_000, folder_path="saved_activations"):
#         self.total_batches = 0
#         self.batches_per_file = batches_per_file
#         self.folder_path = folder_path
        
#         self.files = []
#         self.batch_buffer = []
#         if not os.path.exists(folder_path):
#             os.makedirs(folder_path)
            
#     def save_batch(self, batch):
#         if len(self.batch_buffer) >= self.batches_per_file:
#             new_file = f"{self.folder_path}/activations_{len(self.files)+1}.pkl"
#             torch.save(self.batch_buffer, new_file)
#             self.batch_buffer = []
#             self.files.append(new_file)
            
#         self.batch_buffer.append(batch)
#         self.total_batches +=1
    
#     def save_buffer(self):
#         new_file = f"{self.folder_path}/activations_{len(self.files)+1}.pkl"
#         torch.save(self.batch_buffer, new_file)
#         self.batch_buffer = []
    
#     def __len__(self):
#         return self.total_batches
        
#     def load_file(self, i):
#         return torch.load(self.files[i])
    
#     def generator(self):
#         cur_file = 0
#         while cur_file < len(self.files):
#             for batch in self.load_file(cur_file):
#                 yield batch
#             cur_file += 1
#         for batch in self.batch_buffer:
#             yield batch
# activation_saver = ActivationSaver(batches_per_file=5_000, folder_path="saved_activations/base")

In [15]:
# Loop settings, hoisted so they are bound once rather than on every batch.
max_num_tokens = 100_000_000
log_period = 500  # batches between metric reports
resample_period = 500  # batches between dead-feature checks
max_resample_tokens = 1000  # the number of token activations that we consider for inserting into the dictionary


def _get_layer_activations(token_ids):
    """Run the frozen model on `token_ids` and return the traced layer output.

    Reads the notebook globals `model` and `tensor_names`. The traced output is
    flattened from (b, seq, d_model) to (b*seq, d_model).
    """
    with torch.no_grad():  # As long as not doing KL divergence, don't need gradients for model
        with Trace(model, tensor_names[0]) as ret:
            _ = model(token_ids)
            representation = ret.output
            if isinstance(representation, tuple):
                representation = representation[0]
    return rearrange(representation, "b seq d_model -> (b seq) d_model")


def _collect_resample_candidates(seed):
    """Gather candidate activations and their per-token autoencoder losses.

    A fresh loader is built with `seed`; batches are consumed until more than
    `max_resample_tokens` activations have been collected. Returns CPU tensors
    `(activations, losses)` with one row / one entry per token.
    """
    resample_loader = setup_token_data(cfg, tokenizer, model, seed=seed)
    # Seed the lists with empty tensors so an empty loader still concatenates.
    collected_activations = [torch.empty(0, activation_size)]
    collected_losses = [torch.empty(0)]
    num_resample_data = 0

    for resample_batch in resample_loader:
        resample_tokens = resample_batch["input_ids"].to(cfg.device)
        candidate_activations = _get_layer_activations(resample_tokens)

        # Only the loss values are needed here, so no autograd graph is built.
        with torch.no_grad():
            codes = autoencoder.encode(candidate_activations)
            reconstruction = autoencoder.decode(codes)
            per_token_reconstruction = (reconstruction - candidate_activations).pow(2).mean(dim=-1)
            per_token_l1 = torch.norm(codes, 1, dim=-1)
            per_token_loss = per_token_reconstruction + cfg.l1_alpha*per_token_l1

        collected_activations.append(candidate_activations.detach().cpu())
        collected_losses.append(per_token_loss.detach().cpu())

        num_resample_data += candidate_activations.shape[0]
        if num_resample_data > max_resample_tokens:
            break

    # One concatenation at the end instead of re-copying on every batch.
    return torch.cat(collected_activations, dim=0), torch.cat(collected_losses, dim=0)


def _resample_dead_features(dead_mask, seed):
    """Re-initialise the dictionary rows selected by `dead_mask`.

    New directions are input activations sampled with probability proportional
    to their squared autoencoder loss. Encoder rows are set to the unit vector
    times 0.2 times the mean norm of the surviving encoder rows, encoder biases
    to zero, decoder rows (when the autoencoder has a decoder) to the unit
    vector, and Adam's moment estimates for those rows are cleared.
    """
    num_dead = int(dead_mask.sum().item())
    resample_activations, resample_losses = _collect_resample_candidates(seed)

    # sample num_dead vectors of input activations
    probabilities = resample_losses**2
    # Sampling without replacement needs at least `num_dead` candidates with
    # non-zero weight; otherwise fall back to sampling with replacement.
    num_candidates = int((probabilities > 0).sum().item())
    sampled_indices = torch.multinomial(probabilities, num_dead, replacement=num_dead > num_candidates)
    new_vectors = resample_activations[sampled_indices]
    new_vectors = new_vectors / new_vectors.norm(dim=1, keepdim=True)
    new_vectors = new_vectors.to(cfg.device)

    dead_rows = dead_mask.to(autoencoder.encoder.device)
    alive_rows = ~dead_rows

    with torch.no_grad():
        # calculate average encoder norm of alive neurons
        encoder_row_norms = autoencoder.encoder.norm(dim=-1)
        if alive_rows.any():
            avg_norm = encoder_row_norms[alive_rows].mean()
        else:
            # Every feature is dead: the mean over no rows would be NaN.
            avg_norm = encoder_row_norms.mean()

        # replace dictionary and encoder weights with vectors
        autoencoder.encoder[dead_rows] = new_vectors * avg_norm * 0.2
        autoencoder.encoder_bias[dead_rows] = 0
        params_to_modify = [autoencoder.encoder, autoencoder.encoder_bias]

        if hasattr(autoencoder, 'decoder'):
            autoencoder.decoder[dead_rows] = new_vectors
            params_to_modify.append(autoencoder.decoder)

        for param_group in optimizer.param_groups:
            for param in param_group['params']:
                if not any(param is modified for modified in params_to_modify):
                    continue
                # Adam only creates its state on the first step; skip parameters
                # that have none yet instead of raising KeyError.
                state = optimizer.state.get(param)
                if not state:
                    continue
                state['exp_avg'][dead_rows] = 0  # Reset moving average for modified rows
                state['exp_avg_sq'][dead_rows] = 0  # Reset squared moving average for modified rows


# Per-feature activation totals: `dead_features` is reset at every report,
# `total_activations` at every resampling check. A total of zero is read as
# "never active" (assumes activations are non-negative — confirm against encode()).
dead_features = torch.zeros(autoencoder.encoder.shape[0])
total_activations = torch.zeros(autoencoder.encoder.shape[0])
# Freeze model parameters 
model.eval()
model.requires_grad_(False)
model.to(cfg.device)
last_encoder = autoencoder.encoder.clone().detach()
for i, batch in enumerate(tqdm(token_loader)):
    tokens = batch["input_ids"].to(cfg.device)
    layer_activations = _get_layer_activations(tokens)
    # activation_saver.save_batch(layer_activations.clone().cpu().detach())

    c = autoencoder.encode(layer_activations)
    x_hat = autoencoder.decode(c)
    
    reconstruction_loss = (x_hat - layer_activations).pow(2).mean()
    l1_loss = torch.norm(c, 1, dim=-1).mean()
    total_loss = reconstruction_loss + cfg.l1_alpha*l1_loss

    # Detach before accumulating: adding a tensor that requires grad in place
    # would attach the counters to the autograd graph of every batch.
    batch_feature_totals = c.detach().sum(dim=0).cpu()
    dead_features += batch_feature_totals
    total_activations += batch_feature_totals

    if (i % log_period == 0): # Check here so first check is model w/o change
        num_tokens_so_far = i*cfg.max_length*cfg.model_batch_size
        with torch.no_grad():
            # Similarity between the current encoder and the encoder at the previous report
            self_similarity = torch.cosine_similarity(autoencoder.encoder, last_encoder, dim=-1).mean().cpu().item()
            # Mean number of non-zero features per token in this batch
            sparsity = (c != 0).float().mean(dim=0).sum().cpu().item()
            # Count number of dead_features are zero
            num_dead_features = (dead_features == 0).sum().item()
        last_encoder = autoencoder.encoder.clone().detach()
        print(f"Sparsity: {sparsity:.1f} | Dead Features: {num_dead_features} | Total Loss: {total_loss:.2f} | Reconstruction Loss: {reconstruction_loss:.2f} | L1 Loss: {cfg.l1_alpha*l1_loss:.2f} | l1_alpha: {cfg.l1_alpha:.2e} | Tokens: {num_tokens_so_far} | Self Similarity: {self_similarity:.2f}")
        wandb.log({
            'Sparsity': sparsity,
            'Dead Features': num_dead_features,
            'Total Loss': total_loss.item(),
            'Reconstruction Loss': reconstruction_loss.item(),
            'L1 Loss': (cfg.l1_alpha*l1_loss).item(),
            'l1_alpha': cfg.l1_alpha,
            'Tokens': num_tokens_so_far,
            'Self Similarity': self_similarity
        })
        
        dead_features = torch.zeros(autoencoder.encoder.shape[0])
        
        if(num_tokens_so_far > max_num_tokens):
            print(f"Reached max number of tokens: {max_num_tokens}")
            break

    # Take the optimisation step before any resampling, so gradients are computed
    # against the same weights that produced `total_loss`.
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    if (i % resample_period == 0):
        # RESAMPLING
        dead_mask = total_activations == 0
        # Count number of dead_features are zero
        num_dead_features = dead_mask.sum().item()
        if num_dead_features > 0:
            _resample_dead_features(dead_mask, seed=i)

        total_activations = torch.zeros(autoencoder.encoder.shape[0])

    # # Running sparsity check
    # if(num_tokens_so_far > 500000):
    #     if(i % 200 == 0):
    #         with torch.no_grad():
    #             sparsity = (c != 0).float().mean(dim=0).sum().cpu().item()
    #         if sparsity > target_upper_sparsity:
    #             cfg.l1_alpha *= (1 + adjustment_factor)
    #         elif sparsity < target_lower_sparsity:
    #             cfg.l1_alpha *= (1 - adjustment_factor)
    #         # print(f"Sparsity: {sparsity:.1f} | l1_alpha: {cfg.l1_alpha:.2e}")

  0%|          | 3/110107 [00:00<1:02:59, 29.13it/s]

  0%|          | 8/110107 [00:00<47:14, 38.84it/s]  

Sparsity: 1016.3 | Dead Features: 0 | Total Loss: 1.56 | Reconstruction Loss: 1.19 | L1 Loss: 0.37 | l1_alpha: 1.00e-03 | Tokens: 0 | Self Similarity: 1.00


  0%|          | 508/110107 [00:11<40:26, 45.16it/s]

Sparsity: 145.7 | Dead Features: 0 | Total Loss: 0.14 | Reconstruction Loss: 0.09 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 512000 | Self Similarity: 0.79


  1%|          | 1008/110107 [00:22<40:22, 45.03it/s]

Sparsity: 151.5 | Dead Features: 0 | Total Loss: 0.13 | Reconstruction Loss: 0.07 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 1024000 | Self Similarity: 0.95


  1%|▏         | 1498/110107 [00:32<39:58, 45.28it/s]

Sparsity: 143.8 | Dead Features: 1 | Total Loss: 0.12 | Reconstruction Loss: 0.07 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 1536000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  2%|▏         | 2008/110107 [00:46<40:00, 45.02it/s]  

Sparsity: 144.5 | Dead Features: 0 | Total Loss: 0.11 | Reconstruction Loss: 0.06 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 2048000 | Self Similarity: 0.96


  2%|▏         | 2508/110107 [00:57<39:50, 45.01it/s]

Sparsity: 145.2 | Dead Features: 0 | Total Loss: 0.11 | Reconstruction Loss: 0.06 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 2560000 | Self Similarity: 0.96


  3%|▎         | 3008/110107 [01:08<39:39, 45.00it/s]

Sparsity: 147.7 | Dead Features: 0 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 3072000 | Self Similarity: 0.96


  3%|▎         | 3508/110107 [01:19<39:31, 44.94it/s]

Sparsity: 151.3 | Dead Features: 0 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 3584000 | Self Similarity: 0.96


  4%|▎         | 4008/110107 [01:30<39:23, 44.88it/s]

Sparsity: 146.1 | Dead Features: 0 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 4096000 | Self Similarity: 0.96


  4%|▍         | 4498/110107 [01:41<38:51, 45.30it/s]

Sparsity: 137.3 | Dead Features: 1 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 4608000 | Self Similarity: 0.95


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  5%|▍         | 5008/110107 [01:59<6:50:31,  4.27it/s]

Sparsity: 140.5 | Dead Features: 0 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 5120000 | Self Similarity: 0.96


  5%|▍         | 5498/110107 [02:10<37:42, 46.23it/s]  

Sparsity: 146.2 | Dead Features: 1 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 5632000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  5%|▌         | 5998/110107 [02:23<37:34, 46.18it/s]  

Sparsity: 140.2 | Dead Features: 5 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 6144000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  6%|▌         | 6498/110107 [02:36<37:25, 46.14it/s]  

Sparsity: 144.2 | Dead Features: 2 | Total Loss: 0.10 | Reconstruction Loss: 0.06 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 6656000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  6%|▋         | 6998/110107 [02:48<37:12, 46.18it/s]  

Sparsity: 143.0 | Dead Features: 4 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 7168000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  7%|▋         | 7498/110107 [03:01<36:56, 46.30it/s]  

Sparsity: 141.5 | Dead Features: 12 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 7680000 | Self Similarity: 0.95


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  7%|▋         | 7998/110107 [03:14<36:53, 46.12it/s]  

Sparsity: 142.6 | Dead Features: 5 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 8192000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  8%|▊         | 8498/110107 [03:27<36:39, 46.19it/s]  

Sparsity: 143.2 | Dead Features: 5 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 8704000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  8%|▊         | 8998/110107 [03:40<36:30, 46.16it/s]  

Sparsity: 144.4 | Dead Features: 14 | Total Loss: 0.10 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 9216000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  9%|▊         | 9498/110107 [03:52<36:20, 46.14it/s]  

Sparsity: 134.0 | Dead Features: 53 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 9728000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
  9%|▉         | 9998/110107 [04:05<36:45, 45.39it/s]  

Sparsity: 130.9 | Dead Features: 48 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 10240000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 10%|▉         | 10498/110107 [04:24<35:55, 46.21it/s]   

Sparsity: 140.0 | Dead Features: 22 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 10752000 | Self Similarity: 0.96


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 10%|▉         | 10998/110107 [04:36<35:44, 46.21it/s]  

Sparsity: 141.9 | Dead Features: 25 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 11264000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 10%|█         | 11498/110107 [04:49<35:32, 46.24it/s]  

Sparsity: 139.7 | Dead Features: 8 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 11776000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 11%|█         | 11998/110107 [05:02<35:22, 46.22it/s]  

Sparsity: 145.3 | Dead Features: 17 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 12288000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 11%|█▏        | 12498/110107 [05:15<35:15, 46.13it/s]  

Sparsity: 145.4 | Dead Features: 12 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 12800000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 12%|█▏        | 12998/110107 [05:28<35:00, 46.23it/s]  

Sparsity: 141.3 | Dead Features: 9 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 13312000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 12%|█▏        | 13498/110107 [05:40<34:51, 46.20it/s]  

Sparsity: 136.2 | Dead Features: 2 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 13824000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 13%|█▎        | 13998/110107 [05:53<34:39, 46.22it/s]  

Sparsity: 148.0 | Dead Features: 3 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 14336000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 13%|█▎        | 14498/110107 [06:06<34:29, 46.20it/s]  

Sparsity: 139.4 | Dead Features: 3 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 14848000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 14%|█▎        | 14998/110107 [06:30<34:20, 46.15it/s]  

Sparsity: 144.7 | Dead Features: 3 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 15360000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 14%|█▍        | 15496/110107 [06:43<34:10, 46.14it/s]   

Sparsity: 144.7 | Dead Features: 8 | Total Loss: 0.11 | Reconstruction Loss: 0.06 | L1 Loss: 0.05 | l1_alpha: 1.00e-03 | Tokens: 15872000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 15%|█▍        | 15996/110107 [06:56<33:51, 46.34it/s]  

Sparsity: 147.0 | Dead Features: 5 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 16384000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 15%|█▍        | 16506/110107 [07:09<33:57, 45.95it/s]  

Sparsity: 141.7 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 16896000 | Self Similarity: 0.98


 15%|█▌        | 16996/110107 [07:20<33:36, 46.18it/s]

Sparsity: 135.7 | Dead Features: 3 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 17408000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 16%|█▌        | 17496/110107 [07:33<33:30, 46.07it/s]  

Sparsity: 147.5 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 17920000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 16%|█▋        | 17996/110107 [07:45<33:16, 46.14it/s]  

Sparsity: 128.0 | Dead Features: 11 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 18432000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 17%|█▋        | 18496/110107 [07:58<33:04, 46.16it/s]  

Sparsity: 130.1 | Dead Features: 1 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 18944000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 17%|█▋        | 19006/110107 [08:11<33:19, 45.56it/s]  

Sparsity: 135.6 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 19456000 | Self Similarity: 0.98


 18%|█▊        | 19506/110107 [08:21<33:09, 45.55it/s]

Sparsity: 139.3 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 19968000 | Self Similarity: 0.98


 18%|█▊        | 19996/110107 [08:32<32:26, 46.30it/s]

Sparsity: 141.2 | Dead Features: 1 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 20480000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 19%|█▊        | 20506/110107 [08:50<32:25, 46.07it/s]   

Sparsity: 138.8 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 20992000 | Self Similarity: 0.97


 19%|█▉        | 20996/110107 [09:01<31:58, 46.46it/s]

Sparsity: 141.6 | Dead Features: 5 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 21504000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 20%|█▉        | 21496/110107 [09:14<31:53, 46.30it/s]  

Sparsity: 139.1 | Dead Features: 3 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 22016000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 20%|█▉        | 21996/110107 [09:27<31:35, 46.48it/s]  

Sparsity: 138.9 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 22528000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 20%|██        | 22506/110107 [09:39<31:52, 45.80it/s]  

Sparsity: 133.2 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 23040000 | Self Similarity: 0.96


 21%|██        | 22996/110107 [09:50<31:24, 46.23it/s]

Sparsity: 133.6 | Dead Features: 33 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 23552000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 21%|██▏       | 23496/110107 [10:03<31:10, 46.31it/s]  

Sparsity: 133.6 | Dead Features: 18 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 24064000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 22%|██▏       | 23996/110107 [10:15<30:59, 46.32it/s]  

Sparsity: 140.8 | Dead Features: 9 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 24576000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 22%|██▏       | 24496/110107 [10:28<30:47, 46.33it/s]  

Sparsity: 137.0 | Dead Features: 3 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 25088000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 23%|██▎       | 24996/110107 [10:41<30:41, 46.23it/s]  

Sparsity: 137.0 | Dead Features: 2 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 25600000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 23%|██▎       | 25496/110107 [10:59<30:24, 46.38it/s]   

Sparsity: 140.8 | Dead Features: 1 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 26112000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 24%|██▎       | 26006/110107 [11:12<30:28, 46.00it/s]  

Sparsity: 133.4 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 26624000 | Self Similarity: 0.97


 24%|██▍       | 26506/110107 [11:23<30:25, 45.79it/s]

Sparsity: 137.3 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 27136000 | Self Similarity: 0.98


 25%|██▍       | 27006/110107 [11:34<30:09, 45.92it/s]

Sparsity: 147.1 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 27648000 | Self Similarity: 0.98


 25%|██▍       | 27506/110107 [11:45<30:00, 45.87it/s]

Sparsity: 139.5 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 28160000 | Self Similarity: 0.98


 25%|██▌       | 28006/110107 [11:56<29:49, 45.88it/s]

Sparsity: 143.6 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 28672000 | Self Similarity: 0.98


 26%|██▌       | 28506/110107 [12:06<29:43, 45.77it/s]

Sparsity: 141.4 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 29184000 | Self Similarity: 0.98


 26%|██▋       | 28996/110107 [12:17<29:24, 45.97it/s]

Sparsity: 137.7 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 29696000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 27%|██▋       | 29506/110107 [12:30<29:15, 45.91it/s]  

Sparsity: 133.1 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 30208000 | Self Similarity: 0.96


 27%|██▋       | 29996/110107 [12:41<28:48, 46.34it/s]

Sparsity: 136.2 | Dead Features: 2 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 30720000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 28%|██▊       | 30506/110107 [13:00<28:53, 45.92it/s]   

Sparsity: 143.8 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 31232000 | Self Similarity: 0.98


 28%|██▊       | 31006/110107 [13:11<28:56, 45.56it/s]

Sparsity: 136.4 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 31744000 | Self Similarity: 0.98


 29%|██▊       | 31506/110107 [13:21<28:34, 45.85it/s]

Sparsity: 140.3 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 32256000 | Self Similarity: 0.97


 29%|██▉       | 32006/110107 [13:32<28:20, 45.93it/s]

Sparsity: 141.4 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 32768000 | Self Similarity: 0.97


 30%|██▉       | 32496/110107 [13:43<27:53, 46.37it/s]

Sparsity: 141.6 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 33280000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 30%|██▉       | 32996/110107 [13:55<27:46, 46.27it/s]  

Sparsity: 132.6 | Dead Features: 1 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 33792000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 30%|███       | 33506/110107 [14:08<28:04, 45.47it/s]  

Sparsity: 137.0 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 34304000 | Self Similarity: 0.97


 31%|███       | 34006/110107 [14:19<27:45, 45.69it/s]

Sparsity: 136.6 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 34816000 | Self Similarity: 0.98


 31%|███▏      | 34506/110107 [14:30<27:33, 45.72it/s]

Sparsity: 139.4 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 35328000 | Self Similarity: 0.98


 32%|███▏      | 34996/110107 [14:41<27:03, 46.27it/s]

Sparsity: 139.1 | Dead Features: 1 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 35840000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 32%|███▏      | 35506/110107 [14:59<27:16, 45.60it/s]  

Sparsity: 146.2 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 36352000 | Self Similarity: 0.98


 33%|███▎      | 36006/110107 [15:10<26:55, 45.86it/s]

Sparsity: 142.7 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 36864000 | Self Similarity: 0.98


 33%|███▎      | 36496/110107 [15:20<26:27, 46.38it/s]

Sparsity: 133.6 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 37376000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 34%|███▎      | 36996/110107 [15:33<26:16, 46.36it/s]  

Sparsity: 132.6 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 37888000 | Self Similarity: 0.97


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 34%|███▍      | 37506/110107 [15:46<26:22, 45.87it/s]  

Sparsity: 142.4 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 38400000 | Self Similarity: 0.97


 35%|███▍      | 38006/110107 [15:57<26:12, 45.85it/s]

Sparsity: 134.4 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 38912000 | Self Similarity: 0.97


 35%|███▍      | 38496/110107 [16:07<25:50, 46.19it/s]

Sparsity: 131.8 | Dead Features: 1 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 39424000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 35%|███▌      | 39006/110107 [16:20<25:49, 45.88it/s]  

Sparsity: 137.0 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 39936000 | Self Similarity: 0.97


 36%|███▌      | 39506/110107 [16:31<26:10, 44.95it/s]

Sparsity: 138.7 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 40448000 | Self Similarity: 0.97


 36%|███▋      | 40006/110107 [16:47<4:33:17,  4.28it/s]

Sparsity: 139.6 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 40960000 | Self Similarity: 0.97


 37%|███▋      | 40506/110107 [16:58<25:11, 46.04it/s]  

Sparsity: 136.0 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 41472000 | Self Similarity: 0.98


 37%|███▋      | 41006/110107 [17:08<25:08, 45.81it/s]

Sparsity: 138.6 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 41984000 | Self Similarity: 0.98


 38%|███▊      | 41506/110107 [17:19<24:51, 45.98it/s]

Sparsity: 136.8 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 42496000 | Self Similarity: 0.97


 38%|███▊      | 42006/110107 [17:30<24:40, 46.00it/s]

Sparsity: 136.0 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 43008000 | Self Similarity: 0.97


 39%|███▊      | 42506/110107 [17:41<24:31, 45.94it/s]

Sparsity: 139.6 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 43520000 | Self Similarity: 0.98


 39%|███▉      | 42996/110107 [17:51<24:18, 46.02it/s]

Sparsity: 149.1 | Dead Features: 1 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 44032000 | Self Similarity: 0.98


Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow
 40%|███▉      | 43506/110107 [18:05<24:24, 45.46it/s]  

Sparsity: 143.7 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 44544000 | Self Similarity: 0.98


 40%|███▉      | 44006/110107 [18:16<24:10, 45.57it/s]

Sparsity: 141.5 | Dead Features: 0 | Total Loss: 0.10 | Reconstruction Loss: 0.06 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 45056000 | Self Similarity: 0.97


 40%|████      | 44506/110107 [18:26<23:56, 45.67it/s]

Sparsity: 134.4 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 45568000 | Self Similarity: 0.97


 41%|████      | 45006/110107 [18:43<4:22:17,  4.14it/s]

Sparsity: 138.0 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 46080000 | Self Similarity: 0.98


 41%|████▏     | 45506/110107 [18:53<23:30, 45.79it/s]  

Sparsity: 135.5 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 46592000 | Self Similarity: 0.98


 42%|████▏     | 46006/110107 [19:04<23:52, 44.75it/s]

Sparsity: 137.4 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 47104000 | Self Similarity: 0.98


 42%|████▏     | 46506/110107 [19:15<23:05, 45.90it/s]

Sparsity: 137.8 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 47616000 | Self Similarity: 0.98


 43%|████▎     | 47006/110107 [19:26<22:53, 45.95it/s]

Sparsity: 134.4 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 48128000 | Self Similarity: 0.98


 43%|████▎     | 47506/110107 [19:37<22:50, 45.69it/s]

Sparsity: 130.4 | Dead Features: 0 | Total Loss: 0.08 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 48640000 | Self Similarity: 0.98


 44%|████▎     | 48006/110107 [19:48<22:44, 45.52it/s]

Sparsity: 142.4 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 49152000 | Self Similarity: 0.97


 44%|████▍     | 48506/110107 [19:58<22:29, 45.64it/s]

Sparsity: 137.7 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.04 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 49664000 | Self Similarity: 0.98


 45%|████▍     | 49006/110107 [20:09<22:20, 45.56it/s]

Sparsity: 133.1 | Dead Features: 0 | Total Loss: 0.10 | Reconstruction Loss: 0.06 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 50176000 | Self Similarity: 0.97


 45%|████▍     | 49506/110107 [20:20<22:00, 45.88it/s]

Sparsity: 140.0 | Dead Features: 0 | Total Loss: 0.09 | Reconstruction Loss: 0.05 | L1 Loss: 0.04 | l1_alpha: 1.00e-03 | Tokens: 50688000 | Self Similarity: 0.97


 45%|████▌     | 50000/110107 [20:37<24:47, 40.42it/s]


RuntimeError: [enforce fail at inline_container.cc:337] . unexpected pos 1151820160 vs 1151820048

In [17]:
import os

# Checkpoint name: base model short name + cfg.sparsity + cfg.ratio + first traced tensor name
model_save_name = cfg.model_name.split("/")[-1]
save_name = f"{model_save_name}_sp{cfg.sparsity}_r{cfg.ratio}_{tensor_names[0]}"

# Make directory trained_models if it doesn't exist.
# exist_ok=True replaces the check-then-create pattern, so the cell is safe to re-run
# and cannot fail if the directory appears between the check and the creation.
os.makedirs("trained_models", exist_ok=True)
# Save model (torch.save pickles the whole autoencoder object)
torch.save(autoencoder, f"trained_models/{save_name}.pt")

# if not os.path.exists("activations"):
#     os.makedirs("activations")
# # Save model
# torch.save(saved_activations[:-1], f"activations/{save_name}.pt")

In [18]:
# End the active wandb run.
wandb.finish()

0,1
Dead Features,▁▁▁▁▁▂▃▂█▅▃▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
L1 Loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Reconstruction Loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Self Similarity,█▁▃▃▃▃▂▄▂▄▅▄▄▅▅▅▅▄▂▅▅▅▅▄▄▅▄▅▅▅▄▅▅▅▅▄▅▅▄▅
Sparsity,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Tokens,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Total Loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_alpha,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Dead Features,0.0
L1 Loss,0.04268
Reconstruction Loss,0.04672
Self Similarity,0.97426
Sparsity,139.97461
Tokens,50688000.0
Total Loss,0.08939
l1_alpha,0.001


In [9]:
# Load the target model named in cfg.target_name and keep it on the CPU for now.
target_model = AutoModelForCausalLM.from_pretrained(cfg.target_name).cpu()

# Rebuild the checkpoint name from the base model name, cfg.sparsity, cfg.ratio and the
# first traced tensor name, then load the previously saved autoencoder from trained_models/.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
model_save_name = cfg.model_name.split("/")[-1]
save_name = f"{model_save_name}_sp{cfg.sparsity}_r{cfg.ratio}_{tensor_names[0]}"  # must match the name used when saving
autoencoder = torch.load(f"trained_models/{save_name}.pt")

In [31]:
# Build the transfer autoencoder on top of the already-trained autoencoder
from autoencoders.learned_dict import TiedSAE, UntiedSAE, AnthropicSAE, TransferSAE
from torch import nn

# Alternative (disabled): start from a freshly initialised decoder
# params["decoder"] = torch.empty((n_dict_components, activation_size), device=cfg.device)
# nn.init.xavier_uniform_(params["decoder"])

# All-zero decoder bias with one entry per activation dimension
params["decoder_bias"] = torch.zeros((activation_size,), device=cfg.device)

# The new decoder starts as a detached copy of the trained autoencoder's encoder tensor
initial_decoder = autoencoder.encoder.detach().clone()
transfer_autoencoder = TransferSAE(
    autoencoder,
    decoder=initial_decoder,
    decoder_bias=params["decoder_bias"],
)
transfer_autoencoder.to_device(cfg.device)

# set_grad() is expected to enable gradients for the decoder only (per the original
# note: only the decoder is trained on transfer) - TODO confirm against TransferSAE
transfer_autoencoder.set_grad()

# Only the decoder weight and decoder bias are handed to the optimizer
trainable_tensors = [
    transfer_autoencoder.decoder,
    transfer_autoencoder.decoder_bias,
]
optimizer = torch.optim.Adam(trainable_tensors, lr=cfg.lr)



In [37]:
# Wandb setup
# Read the API key from the local secrets file. The context manager closes the
# file handle deterministically instead of leaving it open until garbage collection.
with open("secrets.json") as secrets_file:
    secrets = json.load(secrets_file)
wandb.login(key=secrets["wandb_key"])
# Timestamp makes the run name unique; the first four characters (the year) are dropped below
start_time = datetime.now().strftime("%Y%m%d-%H%M%S")
wandb_run_name = f"{cfg.target_name}_transfer_{start_time[4:]}_{cfg.sparsity}"  # trim year
print(f"wandb_run_name: {wandb_run_name}")
wandb.init(project="sparse coding", config=dict(cfg), name=wandb_run_name)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


wandb_run_name: EleutherAI/pythia-70m-deduped_transfer_1011-040220_51


In [38]:
def get_activations(model, inputs):
    """Collect the traced layer's activations for every token batch.

    Runs `model` on each element of `inputs` with gradients disabled while
    tracing `tensor_names[0]`, then merges the batch and sequence axes of the
    traced output.

    Args:
        model: model to run; it is called as `model(tokens)`.
        inputs: iterable of token batches.

    Returns:
        A list with one CPU tensor per batch, each rearranged from
        "b seq d_model" to "(b seq) d_model".
    """
    collected = []
    for token_batch in inputs:
        # As long as not doing KL divergence, the model itself needs no gradients
        with torch.no_grad(), Trace(model, tensor_names[0]) as trace:
            model(token_batch)
            traced_output = trace.output
        # When the traced output is a tuple, only its first element is used
        hidden = traced_output[0] if isinstance(traced_output, tuple) else traced_output
        flat_activations = rearrange(hidden, "b seq d_model -> (b seq) d_model")
        collected.append(flat_activations.cpu())
    return collected

In [40]:
# Training transfer autoencoder
#
# Token batches are buffered 500 at a time. For each full buffer the base model and the
# target model are run in turn (only one of them sits on cfg.device at a time) to get
# paired activations, and the transfer autoencoder is then trained to map the encoded
# base activations onto the target activations with a plain MSE loss.
token_loader = setup_token_data(cfg, tokenizer, model, seed=cfg.seed)
dead_features = torch.zeros(transfer_autoencoder.encoder.shape[0])
max_num_tokens = 100_000_000
# Base model starts on the device; target model starts on the CPU, in eval mode and frozen
model = model.to(cfg.device)
target_model = target_model.cpu()
target_model.eval()
target_model.requires_grad_(False)

last_decoder = transfer_autoencoder.decoder.clone().detach()
model_on_gpu = True
# Set once the token budget is exhausted. The inner `break` alone only leaves the
# optimisation loop, so without this flag the outer loop would keep loading batches
# and recomputing activations for the rest of the dataset.
reached_max_tokens = False

saved_inputs = []
i = 0 # counts all optimization steps
for k, batch in enumerate(token_loader):
    if reached_max_tokens:
        break
    saved_inputs.append(batch["input_ids"].to(cfg.device))

    if (k+1)%500==0:
        # compute base and target model activations
        # The two models alternate on the device: whichever model is already there runs
        # first, then the models are swapped and the other one runs.
        if model_on_gpu:
            base_activations = get_activations(model, saved_inputs)
            model = model.cpu()
            target_model = target_model.to(cfg.device)
        target_activations = get_activations(target_model, saved_inputs)
        if not model_on_gpu:
            target_model = target_model.cpu()
            model = model.to(cfg.device)
            base_activations = get_activations(model, saved_inputs)
        model_on_gpu = not model_on_gpu

        # wipe saved inputs
        saved_inputs = []

        # train autoencoder on activations:
        for base_activation, target_activation in zip(base_activations, target_activations):
            c = transfer_autoencoder.encode(base_activation.to(cfg.device))
            x_hat = transfer_autoencoder.decode(c)

            reconstruction_loss = (x_hat - target_activation.to(cfg.device)).pow(2).mean()
            total_loss = reconstruction_loss # NO L1 LOSS

            # detach() keeps the running activation sum out of any autograd graph
            dead_features += c.detach().sum(dim=0).cpu()
            if (i % 500 == 0): # Check here so first check is model w/o change
                num_tokens_so_far = i*cfg.max_length*cfg.model_batch_size
                with torch.no_grad():
                    # Mean cosine similarity between the decoder now and at the previous check
                    self_similarity = torch.cosine_similarity(transfer_autoencoder.decoder, last_decoder, dim=-1).mean().cpu().item()
                    sparsity = (c != 0).float().mean(dim=0).sum().cpu().item()
                    # Count features whose summed activation since the last check is zero
                    num_dead_features = (dead_features == 0).sum().item()
                last_decoder = transfer_autoencoder.decoder.clone().detach()
                reconstruction_loss_value = reconstruction_loss.item()
                print(f"Sparsity: {sparsity:.1f} | Dead Features: {num_dead_features} | Reconstruction Loss: {reconstruction_loss_value:.2f} | Tokens: {num_tokens_so_far} | Self Similarity: {self_similarity:.2f}")
                wandb.log({
                    'Sparsity': sparsity,
                    'Dead Features': num_dead_features,
                    'Reconstruction Loss': reconstruction_loss_value,
                    'Tokens': num_tokens_so_far,
                    'Self Similarity': self_similarity
                })
                # Restart the dead-feature window for the next 500 steps
                dead_features = torch.zeros(transfer_autoencoder.encoder.shape[0])

                if(num_tokens_so_far > max_num_tokens):
                    print(f"Reached max number of tokens: {max_num_tokens}")
                    reached_max_tokens = True
                    break

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
            i+=1

            # # Running sparsity check
            # if(num_tokens_so_far > 500000):
            #     if(i % 200 == 0):
            #         with torch.no_grad():
            #             sparsity = (c != 0).float().mean(dim=0).sum().cpu().item()
            #         if sparsity > target_upper_sparsity:
            #             cfg.l1_alpha *= (1 + adjustment_factor)
            #         elif sparsity < target_lower_sparsity:
            #             cfg.l1_alpha *= (1 - adjustment_factor)
            #         # print(f"Sparsity: {sparsity:.1f} | l1_alpha: {cfg.l1_alpha:.2e}")

Found cached dataset parquet (/root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached processed dataset at /root/.cache/huggingface/datasets/Elriggs___parquet/Elriggs--openwebtext-100k-79076ecafee8a6d5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8401ec7d4dbd84d2_*_of_00008.arrow


Sparsity: 143.0 | Dead Features: 1 | Reconstruction Loss: 0.86 | Tokens: 0 | Self Similarity: 1.00
Sparsity: 145.0 | Dead Features: 0 | Reconstruction Loss: 0.06 | Tokens: 512000 | Self Similarity: 0.84
Sparsity: 143.0 | Dead Features: 0 | Reconstruction Loss: 0.05 | Tokens: 1024000 | Self Similarity: 0.98
Sparsity: 139.8 | Dead Features: 0 | Reconstruction Loss: 0.04 | Tokens: 1536000 | Self Similarity: 0.99
Sparsity: 138.9 | Dead Features: 0 | Reconstruction Loss: 0.04 | Tokens: 2048000 | Self Similarity: 0.99
Sparsity: 140.2 | Dead Features: 0 | Reconstruction Loss: 0.04 | Tokens: 2560000 | Self Similarity: 0.99
Sparsity: 136.2 | Dead Features: 0 | Reconstruction Loss: 0.04 | Tokens: 3072000 | Self Similarity: 0.99
Sparsity: 140.2 | Dead Features: 0 | Reconstruction Loss: 0.04 | Tokens: 3584000 | Self Similarity: 0.99
Sparsity: 138.6 | Dead Features: 0 | Reconstruction Loss: 0.04 | Tokens: 4096000 | Self Similarity: 0.99
Sparsity: 137.1 | Dead Features: 0 | Reconstruction Loss: 0.04

KeyboardInterrupt: 

In [41]:
import os

# Checkpoint name: target model short name + "_transfer" + cfg.sparsity + cfg.ratio + first traced tensor name
model_save_name = cfg.target_name.split("/")[-1]
save_name = f"{model_save_name}_transfer_sp{cfg.sparsity}_r{cfg.ratio}_{tensor_names[0]}"

# Make directory trained_models if it doesn't exist.
# exist_ok=True replaces the check-then-create pattern, so the cell is safe to re-run
# and cannot fail if the directory appears between the check and the creation.
os.makedirs("trained_models", exist_ok=True)
# Save model (torch.save pickles the whole transfer autoencoder object)
torch.save(transfer_autoencoder, f"trained_models/{save_name}.pt")

In [36]:
# End the active wandb run.
wandb.finish()

In [None]:
# Show the current configuration values as the cell output.
cfg

{'model_name': 'EleutherAI/pythia-70m-deduped',
 'layers': [4],
 'setting': 'residual',
 'tensor_name': 'gpt_neox.layers.{layer}',
 'l1_alpha': 0.0020591228579666505,
 'sparsity': 51,
 'num_epochs': 10,
 'model_batch_size': 4,
 'lr': 0.001,
 'kl': False,
 'reconstruction': False,
 'dataset_name': 'NeelNanda/pile-10k',
 'device': 'cuda:0',
 'ratio': 4,
 'max_length': 256}