### Test notebook for development

In [1]:
try:
    import os
    from google.colab import drive
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/School/DS-GA 1011/capstone')
except:
    pass

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ! pip install -r requirements.txt

In [3]:
import os
try:
  from google.colab import userdata
  hf_token = userdata.get_secret('HUGGING_FACE_TOKEN')
except:
  import os
  hf_token = os.getenv('HUGGING_FACE_TOKEN')
hf_login_command = f'huggingface-cli login --token {hf_token} --add-to-git-credential'
os.system(hf_login_command)

256

**Testing the model**

In [4]:
from src.model import AutoencoderConfig, SparseAutoencoder
from src.trainer import TrainingConfig, MonosemanticityTrainer
import torch
from unittest.mock import Mock
import torch.multiprocessing as mp
from src.evaluation import plot_training_metrics
from src.dataset import TextDataset, DataConfig, GPT2ActivationExtractor, GemmaActivationExtractor

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_config = AutoencoderConfig(
     input_dim=768,
    hidden_dim=1024,
)
trainer_config = TrainingConfig(
    batch_size=64,
    num_epochs= 10,
    mixed_precision= 'fp16',
    run_path='tests/',
    learning_rate=1e-4
)

In [6]:
# sample data
normal_dist = torch.distributions.Normal(loc=0, scale=1)
samples = normal_dist.sample(sample_shape=torch.Size([512,model_config.input_dim])).to(device)
dataset = torch.utils.data.TensorDataset(samples)
train_loader = torch.utils.data.DataLoader(dataset, batch_size = trainer_config.batch_size, shuffle=True)
# Mock extractor
extractor_mock = Mock(name='BaseActivationExtractorMock')
extractor_mock.extract_activations.return_value = {'activations': next(iter(train_loader))[0]}

In [7]:
# model and optimizer
model = SparseAutoencoder(model_config)
optimizer = torch.optim.Adam(model.parameters(), lr=trainer_config.learning_rate)
trainer = MonosemanticityTrainer(model, optimizer=optimizer, extractor=extractor_mock, train_config=trainer_config)

In [8]:
# training
trainer.train(train_loader)

INFO:src.trainer:Saving model to: /content/drive/MyDrive/School/DS-GA 1011/capstone/tests/model.pkl
  0%|          | 0/10 [00:00<?, ?it/s]INFO:src.trainer:
Epoch 1/10
DEBUG:src.trainer:Training epoch
 10%|█         | 1/10 [00:01<00:16,  1.85s/it]INFO:src.trainer:
Epoch 2/10
DEBUG:src.trainer:Training epoch
INFO:src.trainer:
Epoch 3/10
DEBUG:src.trainer:Training epoch
 30%|███       | 3/10 [00:02<00:03,  1.84it/s]INFO:src.trainer:
Epoch 4/10
DEBUG:src.trainer:Training epoch
 40%|████      | 4/10 [00:02<00:02,  2.37it/s]INFO:src.trainer:
Epoch 5/10
DEBUG:src.trainer:Training epoch
INFO:src.trainer:
Epoch 6/10
DEBUG:src.trainer:Training epoch
 60%|██████    | 6/10 [00:02<00:01,  3.95it/s]INFO:src.trainer:
Epoch 7/10
DEBUG:src.trainer:Training epoch
 70%|███████   | 7/10 [00:02<00:00,  4.63it/s]INFO:src.trainer:
Epoch 8/10
DEBUG:src.trainer:Training epoch
INFO:src.trainer:
Epoch 9/10
DEBUG:src.trainer:Training epoch
 90%|█████████ | 9/10 [00:02<00:00,  6.26it/s]INFO:src.trainer:
Epoch 10/1

{'train_loss': [1.0250835418701172,
  0.9157070517539978,
  0.8253010511398315,
  0.7402318716049194,
  0.6539566516876221,
  0.5645681023597717,
  0.4735264182090759,
  0.3845852017402649,
  0.30233022570610046,
  0.23080089688301086],
 'train_mse_loss': [1.00179123878479,
  0.8929671049118042,
  0.8022847175598145,
  0.7162207365036011,
  0.6283529996871948,
  0.5369112491607666,
  0.44351235032081604,
  0.35206830501556396,
  0.267313688993454,
  0.1934213638305664],
 'train_l1_regularization': [0.023292286321520805,
  0.022739913314580917,
  0.023016374558210373,
  0.024011168628931046,
  0.025603655725717545,
  0.02765684574842453,
  0.030014047399163246,
  0.03251691162586212,
  0.03501654416322708,
  0.037379518151283264]}

**Testing data generation**

In [9]:

dt_config = DataConfig(
    dataset_name="RealTimeData/wikitext_latest",
    split="train",
    text_column="text",
    model_name='gpt2',
    use_flash_attention=False
)
extractor = GPT2ActivationExtractor(dt_config)


INFO:src.dataset:Using gpt2 extractor on cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
gpt2_extractor = GPT2ActivationExtractor(dt_config)


INFO:src.dataset:Using gpt2 extractor on cuda


In [11]:
be = gpt2_extractor.tokenizer(['I love you'])
res = gpt2_extractor.extract_activations(be, 5)
res['activations'], res['activations'].shape

(tensor([[[0.0000, 0.0000, 0.0600,  ..., 0.0000, 0.0294, 0.0407],
          [0.0222, 0.0000, 0.0000,  ..., 0.0000, 0.1515, 0.1571],
          [0.6869, 0.3262, 0.0000,  ..., 0.0000, 0.1399, 0.3196]]],
        device='cuda:0'),
 torch.Size([1, 3, 768]))

In [12]:

try:
    mp.set_start_method('spawn')
except RuntimeError:
    pass # method has already been set


dataset = TextDataset(
    tokenizer=gpt2_extractor.tokenizer,
    config=dt_config,
)
dataloader = dataset.get_dataloader(batch_size=64)

README.md:   0%|          | 0.00/805 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/826 [00:00<?, ? examples/s]

In [13]:
batch = next(iter(dataloader))
batch

['The Golden Road: How Ancient India Transformed the World is a 2024 history book by William Dalrymple. It discusses the ways in which India\'s ideas and influences spread throughout and shaped Eurasia.\n\n Overview \nThe book argues that the primary route connecting Eurasia from 250 BC to 1200 AD was a route going through India referred to in the book as the "Golden Road"; this route facilitated an Indian sphere of influence referred to as the Indosphere.\n\nIndia\'s outward influence began with the west coast of India interacting with the outside world as far west as the Roman Empire; the fall of Rome in the 5th and 6th centuries then forced Indian traders to turn their attention eastward, resulting in significant influence upon Southeast Asia. By the 7th century, Buddhism had penetrated China, with the reign of Wu Zetian resulting in a brief Indianization of the royal court and a general explosion of learning from India. And by the 13th century, Indian mathematical and astronomical 

In [14]:
trainer = MonosemanticityTrainer(model, optimizer=optimizer, extractor=extractor_mock, train_config=trainer_config)
metrics = trainer.train(dataloader)
metrics

INFO:src.trainer:Saving model to: /content/drive/MyDrive/School/DS-GA 1011/capstone/tests/model.pkl
  0%|          | 0/10 [00:00<?, ?it/s]INFO:src.trainer:
Epoch 1/10
DEBUG:src.trainer:Training epoch
 10%|█         | 1/10 [00:00<00:03,  2.42it/s]INFO:src.trainer:
Epoch 2/10
DEBUG:src.trainer:Training epoch
 20%|██        | 2/10 [00:00<00:03,  2.28it/s]INFO:src.trainer:
Epoch 3/10
DEBUG:src.trainer:Training epoch
 30%|███       | 3/10 [00:01<00:02,  2.36it/s]INFO:src.trainer:
Epoch 4/10
DEBUG:src.trainer:Training epoch
 40%|████      | 4/10 [00:01<00:02,  2.47it/s]INFO:src.trainer:
Epoch 5/10
DEBUG:src.trainer:Training epoch
 50%|█████     | 5/10 [00:02<00:01,  2.55it/s]INFO:src.trainer:
Epoch 6/10
DEBUG:src.trainer:Training epoch
 60%|██████    | 6/10 [00:02<00:01,  2.52it/s]INFO:src.trainer:
Epoch 7/10
DEBUG:src.trainer:Training epoch
 70%|███████   | 7/10 [00:02<00:01,  2.58it/s]INFO:src.trainer:
Epoch 8/10
DEBUG:src.trainer:Training epoch
 80%|████████  | 8/10 [00:03<00:00,  2.58it/

{'train_loss': [0.15805894136428833,
  0.098643459379673,
  0.06732356548309326,
  0.05331827327609062,
  0.04772385582327843,
  0.045427076518535614,
  0.044229328632354736,
  0.04336941987276077,
  0.042614251375198364,
  0.04189608618617058],
 'train_mse_loss': [0.11799036711454391,
  0.056049101054668427,
  0.02320094406604767,
  0.008556297048926353,
  0.002974497154355049,
  0.0010889292461797595,
  0.0005001163226552308,
  0.00032402598299086094,
  0.00026957711088471115,
  0.0002494312939234078],
 'train_l1_regularization': [0.040068574249744415,
  0.042594362050294876,
  0.04412262514233589,
  0.04476197436451912,
  0.04474935680627823,
  0.04433814436197281,
  0.04372921586036682,
  0.043045394122600555,
  0.042344674468040466,
  0.04164665564894676]}

In [15]:
plot_training_metrics(metrics, 'test.png')

In [19]:
extractor._get_final_layer().normalized_shape

(768,)

In [17]:
print(extractor.model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_a