### Mount Drive

In [1]:
# Mount Google Drive under /content/drive; later cells read data from and
# save tensors to paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Model Activations

The model is loaded from the Hugging Face `transformers` library, and activations are extracted from a chosen layer for every token of a chosen dataset.

The activations come from inside the MLP block: after the GELU activation has been applied, and before the projection layer (`c_proj`) that maps this output back to the residual dimension and into the residual stream.

In [2]:
# extract_gpt2_mlp_activations.py
import os
import csv
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.activations import ACT2FN

## Get Model

In [3]:
# Load model + tokenizer Function
def get_model_tokenizer(model_name = 'gpt2'):
  """Load a pretrained causal language model together with its tokenizer.

  Args:
    model_name: Checkpoint name or path handed to ``from_pretrained`` for
      both the tokenizer and the model. Defaults to ``'gpt2'``.

  Returns:
    A ``(tokenizer, model)`` tuple.
  """
  # Honour the requested checkpoint so tokenizer and model always match
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  if tokenizer.pad_token is None:
      # No padding token defined: reuse the end-of-sequence token for padding
      tokenizer.pad_token = tokenizer.eos_token

  model = AutoModelForCausalLM.from_pretrained(model_name, output_hidden_states=True)

  return tokenizer, model

In [4]:
# Build the tokenizer and model with the function's default checkpoint name
tokenizer, model = get_model_tokenizer()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# Prefer CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Inference only: put the model in eval mode and move it to the chosen device
model.eval()
model.to(device)

print(f"Using device: {device}")

Using device: cuda


### Hidden Layers

In [6]:
# Locate the list of transformer blocks at `model.transformer.h`.
# Any other layout fails here with a clear message instead of surfacing
# later as a NameError on `blocks` (or silently reusing a stale `blocks`
# left over from an earlier run of this cell).
if hasattr(model, "transformer") and hasattr(model.transformer, "h"):
    blocks = model.transformer.h
else:
    raise AttributeError(
        f"{type(model).__name__} has no `transformer.h` block list; "
        "this notebook expects a model that exposes its blocks there."
    )

n_layers = len(blocks)
print("Model has", n_layers, "blocks.")

Model has 12 blocks.


In [7]:
# Display the module tree of the transformer blocks
blocks

ModuleList(
  (0-11): 12 x GPT2Block(
    (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (attn): GPT2Attention(
      (c_attn): Conv1D(nf=2304, nx=768)
      (c_proj): Conv1D(nf=768, nx=768)
      (attn_dropout): Dropout(p=0.1, inplace=False)
      (resid_dropout): Dropout(p=0.1, inplace=False)
    )
    (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (mlp): GPT2MLP(
      (c_fc): Conv1D(nf=3072, nx=768)
      (c_proj): Conv1D(nf=768, nx=3072)
      (act): NewGELUActivation()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

We take the activations from layer 11 (1-indexed, i.e. `blocks[10]`) for our first experiments.

In [8]:
# 1-based number of the layer to inspect; `blocks` is 0-based, hence the -1
LAYER_IDX = 11
sample_block = blocks[LAYER_IDX - 1]


# Read the layer widths from the `nf` attribute of the Conv1D projections
# (per the module repr, `nf` is the size a Conv1D maps *to*)
residual_dim = sample_block.attn.c_proj.nf
mlp_dim = sample_block.mlp.c_fc.nf

print("Residual dimension:", residual_dim)
print("Detected MLP activation dimension:", mlp_dim)

Residual dimension: 768
Detected MLP activation dimension: 3072


### Define the hooks and add them to the target block

In [56]:
# 1-based layer number; `blocks` is 0-based, hence the -1
target_layer_id = 11
target_block = blocks[target_layer_id - 1]

# Buffers that the hook functions append to, plus the list of hook handles
residual_pre_mlp, mlp_activations, hooks = [], [], []

In [57]:
# ---------------------------
# HOOK 1 — residual stream entering the MLP sub-block
# Meant to be registered as a forward *pre*-hook on ln_2
# ---------------------------
def hook_residual_pre_mlp(module, input):
    """Store the single tensor about to be passed into the hooked module.

    As a pre-hook it runs before the module's forward(). When attached to
    ln_2, that tensor is intended to be the residual stream after the
    attention output has been added and before the MLP.

    Args:
        module: The hooked module (unused).
        input: 1-tuple holding the module's positional input tensor.
    """
    [hidden_states] = input  # exactly one positional input is expected
    snapshot = hidden_states.detach().cpu()
    residual_pre_mlp.append(snapshot)

# ---------------------------
# HOOK 2 — MLP post-GELU activation
# Registered on c_fc: its output is the 3072-dim pre-activation, so the
# non-linearity is re-applied here to obtain what feeds c_proj
# ---------------------------
def hook_mlp(module, input, output):
    """
    Forward hook for ``mlp.c_fc`` that stores the post-GELU MLP activation.

    c_fc produces the pre-GELU activations; HF GPT-2 places the
    non-linearity inside GPT2MLP, not inside c_fc, so it is applied manually.

    The model's activation is NewGELUActivation (the tanh approximation of
    GELU), so ``approximate="tanh"`` is required. The default ``gelu`` is the
    exact erf form and gives slightly different values from what the model
    itself passes on to c_proj.

    Args:
        module: The hooked module (unused).
        input: Positional inputs of the module (unused).
        output: The module's forward output, treated as the pre-GELU tensor.
    """
    pre_gelu = output
    post_gelu = torch.nn.functional.gelu(pre_gelu, approximate="tanh")
    mlp_activations.append(post_gelu.detach().cpu())

In [58]:
# Attach both hooks to the target block and keep their handles so they can
# be removed later
residual_handle = target_block.ln_2.register_forward_pre_hook(hook_residual_pre_mlp)
mlp_handle = target_block.mlp.c_fc.register_forward_hook(hook_mlp)

hooks += [residual_handle, mlp_handle]

### Activations of dummy data

In [59]:
# ---- RUN MODEL ON AN INPUT ----
text = "Market sentiment improved as inflation fell."
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(model.device)

# Only the hook side effects matter here, so the model output is discarded
with torch.no_grad():
    _ = model(**inputs)

# ---- CHECK RESULTS ----
first_residual, first_mlp = residual_pre_mlp[0], mlp_activations[0]
print("Residual pre-MLP shape:", first_residual.shape)
print("MLP post-GELU shape:", first_mlp.shape)

Residual pre-MLP shape: torch.Size([1, 7, 768])
MLP post-GELU shape: torch.Size([1, 7, 3072])


In [60]:
# Detach every registered hook so later forward passes no longer append to
# the activation buffers
for h in hooks:
  h.remove()

#### See activations

In [61]:
# Map each decoded token of the dummy sentence to its two activation vectors.
# NOTE: keys are the token strings, so a token that occurs more than once
# keeps only the activations of its last occurrence.
token_activations = {
    tokenizer.decode(token_id): {
        "residual_pre_mlp": residual_pre_mlp[0][0][idx],  # 768-dim tensor
        "mlp_post_gelu": mlp_activations[0][0][idx],      # 3072-dim tensor
    }
    for idx, token_id in enumerate(inputs.input_ids[0])
}

In [62]:
# Show the decoded tokens that were used as dictionary keys
token_activations.keys()

dict_keys(['Market', ' sentiment', ' improved', ' as', ' inflation', ' fell', '.'])

## Financial PhraseBank Data

### Some exploration of hidden layers

In [None]:
text = "The company reported higher earnings this quarter."
inputs = tokenizer(text, return_tensors="pt").to(device)

# Request the per-layer hidden states explicitly on the forward call
with torch.no_grad():
    outputs = model(output_hidden_states=True, **inputs)

hidden_states = outputs.hidden_states

In [None]:
# `hidden_states` holds one more entry than the model has blocks (13 vs 12
# in the recorded output); presumably the extra entry is the embedding
# output — confirm against the transformers documentation.
print(f"Number of layers: {len(hidden_states)}")
print(f"Shape of last layer: {hidden_states[-1].shape}")

Number of layers: 13
Shape of last layer: torch.Size([1, 8, 768])


### Get the data

In [None]:
from datasets import load_dataset

In [None]:
# # Use the correct dataset name and config
# dataset = load_dataset("takala/financial_phrasebank", "sentences_50agree")

# # Access the sentence/label fields
# sentences = dataset["train"]["sentence"]
# labels    = dataset["train"]["label"]

# print(len(sentences), sentences[:5], labels[:5])

In [None]:
# Use a csv instead: the file has no header row, so column names are supplied
phrasebank_csv = "/content/drive/MyDrive/Colab Notebooks/SLDS_Project/data/all-data.csv"
dataset = pd.read_csv(
    phrasebank_csv,
    header=None,
    names=["label", "sentence"],
    encoding="latin1",
)
dataset.head()

Unnamed: 0,label,sentence
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [None]:
# (rows, columns) of the loaded frame
dataset.shape

# Optional subsampling for quick experiments, currently disabled:
# # Trim data to 1000
# dataset = dataset.iloc[:1000]
# dataset.shape

(4846, 2)

In [None]:
# Row-aligned Python lists of the sentence texts and their labels
sentences = dataset["sentence"].tolist()
labels    = dataset["label"].tolist()

### Get Intermediate activations and store in a dict

In [None]:
# Add hooks again.
# First detach any handles still held in `hooks`: rebinding `hooks` to a new
# list would otherwise orphan live hooks (e.g. when this cell is re-run), and
# every forward pass would then record each activation more than once.
for stale_handle in hooks:
    stale_handle.remove()

residual_handle = target_block.ln_2.register_forward_pre_hook(hook_residual_pre_mlp)
mlp_handle = target_block.mlp.c_fc.register_forward_hook(hook_mlp)

hooks = [residual_handle, mlp_handle]

In [None]:
# One (sentence, {position -> record}) tuple per input sentence
all_activations = []

for sent in tqdm(sentences):
    # Empty the hook buffers so index 0 below belongs to this sentence
    residual_pre_mlp.clear()
    mlp_activations.clear()

    # Tokenize a single sentence (batch of one) and move it to `device`
    inputs = tokenizer(sent, return_tensors="pt").to(device)

    # Forward pass; the registered hooks capture the activations
    with torch.no_grad():
        _ = model(**inputs)

    # Drop the batch dimension (shape: seq_len x hidden_dim)
    res_tensor = residual_pre_mlp[0].squeeze(0)  # 768-dim
    mlp_tensor = mlp_activations[0].squeeze(0)    # 3072-dim

    # position -> decoded token plus its two activation vectors
    token_dict = {
        i: {
            "token": tokenizer.decode(token_id),
            "residual_pre_mlp": res_tensor[i].numpy(),
            "mlp_post_gelu": mlp_tensor[i].numpy(),
        }
        for i, token_id in enumerate(inputs.input_ids[0])
    }

    all_activations.append((sent, token_dict))

100%|██████████| 4846/4846 [06:39<00:00, 12.12it/s]


### View activations and store

In [None]:
# Sanity check: print the first five sentences with their token positions
for sent, token_dict in all_activations[:5]:
    print(sent)
    print(token_dict.keys())

According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])
Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])
The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41])
With the new production plant the company 

In [None]:
# Examples of indexing into the nested structure; only the value of the last
# expression is displayed as the cell output.
all_activations[0][0]  # the sentence string
all_activations[0][1]  # the position_dict for that sentence
all_activations[0][1][3]["token"]  # token at position 3
all_activations[0][1][3]["residual_pre_mlp"]  # 768-dim vector

array([ 3.80928367e-02, -2.17159891e+00, -1.38765883e+00, -2.43347216e+00,
        1.51229823e+00,  1.04938000e-01,  3.03786063e+00, -2.35575700e+00,
       -2.42494392e+00,  5.22102261e+00,  2.72749496e+00, -3.30801988e+00,
        4.12133837e+00,  2.72933030e+00,  2.59377688e-01,  2.72197461e+00,
       -4.30198765e+00, -1.49535990e+00,  2.57245302e+00, -3.80865979e+00,
        2.50523162e+00,  1.85031927e+00,  7.06775248e-01,  5.74389172e+00,
       -1.79784751e+00,  1.13075852e+00, -3.51824611e-01, -2.96911716e-01,
       -1.74400306e+00,  7.43943834e+00,  1.60815406e+00, -1.39937997e-01,
        2.47691274e+00,  5.40423679e+00,  1.27463043e-02, -2.54933167e+00,
       -8.03677678e-01,  3.05961990e+00, -2.04002118e+00,  3.40859032e+00,
        3.21756363e+00,  4.02732706e+00,  1.67228401e+00, -1.10432792e+00,
       -2.10715914e+00,  2.73420715e+00,  4.00489426e+00, -1.83907509e+00,
       -9.47790444e-02,  3.38409615e+00,  2.73247075e+00, -1.12960055e-01,
       -1.17832386e+00,  

In [None]:
# Total token count = sum of the per-sentence position-dict sizes
total_tokens = sum(len(entry[1]) for entry in all_activations)
print("Total tokens:", total_tokens)


Total tokens: 138405


### Store the activations

In [None]:
# Flatten sentence -> position -> record into one row per token, keeping the
# sentence index and in-sentence position alongside each record
flat_rows = [
    (sent_idx, pos_idx, token_info)
    for sent_idx, (_sentence, pos_dict) in enumerate(all_activations)
    for pos_idx, token_info in pos_dict.items()
]

# Row-aligned parallel lists
sentence_idx_list = [row[0] for row in flat_rows]
position_idx_list = [row[1] for row in flat_rows]
residuals_list = [row[2]["residual_pre_mlp"] for row in flat_rows]
mlp_list = [row[2]["mlp_post_gelu"] for row in flat_rows]
tokens_list = [row[2]["token"] for row in flat_rows]

# Convert to tensors
residuals_tensor = torch.tensor(np.stack(residuals_list), dtype=torch.float32)  # total_tokens x 768
mlp_tensor = torch.tensor(np.stack(mlp_list), dtype=torch.float32)              # total_tokens x 3072
sentence_idx_tensor = torch.tensor(sentence_idx_list, dtype=torch.long)
position_idx_tensor = torch.tensor(position_idx_list, dtype=torch.long)

In [None]:
# Persist the residual-stream vectors with their token/sentence/position
# metadata (torch.save is used here; pickle would also work)
output_file = f"/content/drive/MyDrive/Colab Notebooks/SLDS_Project/token_activations/financial_phrasebank_gpt2_layer{target_layer_id}_residuals.pt"
residuals_payload = {
    "residuals_post_attn": residuals_tensor,
    "tokens": tokens_list,
    "sentence_idx": sentence_idx_tensor,
    "position_idx": position_idx_tensor,
}
torch.save(residuals_payload, output_file)

In [None]:
# Persist the post-GELU MLP activations with the same token/sentence/position
# metadata (torch.save is used here; pickle would also work)
output_file = f"/content/drive/MyDrive/Colab Notebooks/SLDS_Project/token_activations/financial_phrasebank_gpt2_layer{target_layer_id}_mlp.pt"
mlp_payload = {
    "mlp_post_gelu": mlp_tensor,
    "tokens": tokens_list,
    "sentence_idx": sentence_idx_tensor,
    "position_idx": position_idx_tensor,
}
torch.save(mlp_payload, output_file)

## Some new Data - Huggingface

In [17]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset

In [19]:
# Load the train split of the finance-news dataset from the Hugging Face hub
ds = load_dataset("lukecarlate/english_finance_news", split="train")

# Inspect the available columns and the first record
print(ds.column_names)
print(ds[0])

english_financial_news_v2.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/27144 [00:00<?, ? examples/s]

['newssource', 'newscontents', 'label']
{'newssource': 'auditor', 'newscontents': "Altia 's operating profit jumped to EUR 47 million from EUR 6.6 million .", 'label': 2}


In [24]:
# Collect the text field of every record into a plain list
news_texts = [row["newscontents"] for row in ds]

### Get Intermediate activations and store in a dict

In [63]:
# Add hooks again.
# First detach any handles still held in `hooks`: on a top-to-bottom run the
# hooks registered for the previous dataset were never removed, and rebinding
# `hooks` to a new list would orphan them, so every forward pass would record
# each activation twice.
for stale_handle in hooks:
    stale_handle.remove()

residual_handle = target_block.ln_2.register_forward_pre_hook(hook_residual_pre_mlp)
mlp_handle = target_block.mlp.c_fc.register_forward_hook(hook_mlp)

hooks = [residual_handle, mlp_handle]

In [64]:
# One (text, {position -> record}) tuple per news item
all_activations = []

for sent in tqdm(news_texts):
    # Reset the hook buffers; after the forward pass index 0 is this text
    mlp_activations.clear()
    residual_pre_mlp.clear()

    # Tokenize one text at a time (batch of one) and move it to `device`
    inputs = tokenizer(sent, return_tensors="pt").to(device)

    # Forward pass; the hooks fill the buffers as a side effect
    with torch.no_grad():
        _ = model(**inputs)

    # Remove the batch dimension (shape: seq_len x hidden_dim)
    res_tensor = residual_pre_mlp[0].squeeze(0)  # 768-dim
    mlp_tensor = mlp_activations[0].squeeze(0)    # 3072-dim

    # Build position -> {token, residual vector, MLP vector}
    token_dict = {}
    for i, token_id in enumerate(inputs.input_ids[0]):
        record = {
            "token": tokenizer.decode(token_id),
            "residual_pre_mlp": res_tensor[i].numpy(),
            "mlp_post_gelu": mlp_tensor[i].numpy(),
        }
        token_dict[i] = record

    all_activations.append((sent, token_dict))

100%|██████████| 27144/27144 [06:42<00:00, 67.46it/s]


In [65]:
# Detach the hooks now that extraction for this dataset is finished
for h in hooks:
  h.remove()

### View activations and store

In [66]:
# Sanity check: print the first five texts with their token positions
for sent, token_dict in all_activations[:5]:
    print(sent)
    print(token_dict.keys())

Altia 's operating profit jumped to EUR 47 million from EUR 6.6 million .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17])
The agreement was signed with Biohit Healthcare Ltd , the UK-based subsidiary of Biohit Oyj , a Finnish public company which develops , manufactures and markets liquid handling products and diagnostic test systems .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38])
Kesko pursues a strategy of healthy , focused growth concentrating on sales and services to consumer-customers .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
Vaisala , headquartered in Helsinki in Finland , develops and manufactures electronic measurement systems for meteorology , environmental sciences , traffic and industry .
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24

In [67]:
# Walk every (sentence, position) pair once and collect one row per token
token_rows = [
    (sent_idx, pos_idx, token_info)
    for sent_idx, (_sentence, pos_dict) in enumerate(all_activations)
    for pos_idx, token_info in pos_dict.items()
]

# Split the rows into row-aligned parallel lists
residuals_list = [info["residual_pre_mlp"] for _s, _p, info in token_rows]
mlp_list = [info["mlp_post_gelu"] for _s, _p, info in token_rows]
tokens_list = [info["token"] for _s, _p, info in token_rows]
sentence_idx_list = [s for s, _p, _info in token_rows]
position_idx_list = [p for _s, p, _info in token_rows]

# Convert to tensors
residuals_tensor = torch.tensor(np.stack(residuals_list), dtype=torch.float32)  # total_tokens x 768
mlp_tensor = torch.tensor(np.stack(mlp_list), dtype=torch.float32)              # total_tokens x 3072
sentence_idx_tensor = torch.tensor(sentence_idx_list, dtype=torch.long)
position_idx_tensor = torch.tensor(position_idx_list, dtype=torch.long)

In [68]:
# Confirm which layer number will be embedded in the output file name
target_layer_id

11

In [70]:
# Persist the residual-stream vectors of the news dataset with their
# token/sentence/position metadata (torch.save is used; pickle would also work)
output_file = f"/content/drive/MyDrive/Colab Notebooks/SLDS_Project/token_activations/lukacarte_financial_gpt2_layer{target_layer_id}_residuals.pt"
news_residuals_payload = {
    "residuals_post_attn": residuals_tensor,
    "tokens": tokens_list,
    "sentence_idx": sentence_idx_tensor,
    "position_idx": position_idx_tensor,
}
torch.save(news_residuals_payload, output_file)