### HF Auth

In [None]:
# Interactive Hugging Face CLI login. The token is cached on disk by the CLI,
# presumably so the later Hub downloads in this notebook run authenticated
# (NOTE(review): confirm the model/dataset used below actually require auth).
!hf auth login

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: write).
The token `training` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `training`


### Probing Dataset Generator

In [2]:
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class ProbingDatasetGenerator:
    """Runs a causal LM over chat-formatted prompts and exports its hidden states.

    Every input text is wrapped as a single user turn, rendered through the
    tokenizer's chat template, and pushed through the model in one forward
    pass with ``output_hidden_states=True``.
    """

    def __init__(self, model_name, device="cuda"):
        """
        Loads the tokenizer and the model, and moves the model to ``device``.

        Args:
            model_name (str): Identifier handed to ``from_pretrained``.
            device (str): Device the model and the encoded inputs are moved to.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.device = device

    def _generate_batch(self, texts):
        """
        Runs a single forward pass over a batch of texts.

        Despite the name, no tokens are generated: the model is called once
        with ``output_hidden_states=True`` and its raw outputs are returned.

        Args:
            texts (iterable of str): Raw user messages; each one becomes a
                one-turn conversation before the chat template is applied.

        Returns:
            outputs: The model outputs for the padded batch.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        texts = list(texts)
        if not texts:
            # Fail early with a clear message instead of an obscure tokenizer error.
            raise ValueError("texts must contain at least one input")

        conversations = [[{"role": "user", "content": text}] for text in texts]

        encoded = self.tokenizer.apply_chat_template(
            conversations,
            add_generation_prompt=True,
            tokenize=True,
            padding=True,
            return_dict=True,
            return_tensors="pt",
        )
        # Move every tensor to the target device once; no second `.to()` is needed below.
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            return self.model(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                return_dict=True,
                output_hidden_states=True,
            )

    def get_activations(self, texts):
        """
        Retrieves the hidden states for a batch of texts.

        Args:
            texts (iterable of str): Raw user messages, see ``_generate_batch``.

        Returns:
            hidden_states: The ``hidden_states`` field of the model outputs.
        """
        return self._generate_batch(texts).hidden_states

    def store_activations(self, hidden_states, label, filepath):
        """
        Saves all given hidden states together with a label to a file.

        The saved object is a dict with two entries: ``"activations"``, mapping
        ``"hidden_state<i>"`` to the i-th tensor (moved to CPU), and ``"label"``.

        Args:
            hidden_states (iterable of torch.Tensor): Hidden states, e.g. the
                value returned by ``get_activations``.
            label: Label stored alongside the activations.
            filepath (str): Destination passed to ``torch.save``. When it is a
                path, missing parent directories are created first.
        """
        # Only touch the filesystem for real paths; torch.save also accepts file-like objects.
        if isinstance(filepath, (str, os.PathLike)):
            parent_dir = os.path.dirname(filepath)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        payload = {
            "activations": {
                f"hidden_state{i}": hidden_state.cpu()
                for i, hidden_state in enumerate(hidden_states)
            },
            "label": label,
        }

        torch.save(payload, filepath)

### Dataset Loaders

In [3]:
import kagglehub
from abc import ABC, abstractmethod
import pandas as pd
from datasets import load_dataset, load_from_disk

class DatasetLoader(ABC):
    """Interface for dataset sources: first download the data, then load it."""

    @abstractmethod
    def download_dataset(self, dataset_name) -> str:
        """Download the dataset identified by ``dataset_name``.

        Returns:
            path: Path to the downloaded dataset.
        """

    @abstractmethod
    def load_dataset(self) -> pd.DataFrame:
        """Load the dataset that was previously downloaded.

        Returns:
            data: The loaded dataset.
        """


class KaggleProbingDatasetLoader(DatasetLoader):
    """Downloads a Kaggle dataset through ``kagglehub`` and loads one CSV file from it."""

    def __init__(self):
        # Filled in by download_dataset(); None means nothing was downloaded yet.
        self.dataset_folder = None

    def download_dataset(self, dataset_name) -> str:
        """
        Downloads the Kaggle dataset and remembers where it was stored.

        Args:
            dataset_name (str): Dataset handle passed to ``kagglehub.dataset_download``.

        Returns:
            path: Path to the downloaded dataset folder.
        """
        self.dataset_folder = kagglehub.dataset_download(dataset_name)
        return self.dataset_folder

    def load_dataset(self, filename="data_set_4.csv") -> pd.DataFrame:
        """
        Reads a CSV file from the downloaded dataset folder.

        Args:
            filename (str): Name of the CSV file inside the dataset folder.

        Returns:
            data: The CSV contents as a DataFrame.

        Raises:
            RuntimeError: If ``download_dataset`` has not been called yet.
        """
        import os

        if self.dataset_folder is None:
            raise RuntimeError(
                "No dataset folder is set; call download_dataset() before load_dataset()."
            )
        return pd.read_csv(os.path.join(self.dataset_folder, filename))

class HuggingFaceDatasetLoader(DatasetLoader):
    """Downloads one split of a Hugging Face dataset, saves it to disk, and reloads it."""

    def __init__(self):
        # Filled in by download_dataset(); None means nothing was downloaded yet.
        self.dataset_folder = None

    def download_dataset(self, dataset_name, split="train") -> str:
        """
        Downloads one split of a dataset and saves it to a local folder.

        Args:
            dataset_name (str): Dataset identifier passed to ``datasets.load_dataset``.
            split (str): Split to download.

        Returns:
            path: Folder the split was saved to (``./<dataset_name>_<split>``).
        """
        target_folder = f"./{dataset_name}_{split}"
        # `load_dataset` here is the module-level function imported from
        # `datasets`, not the method of the same name defined below.
        split_data = load_dataset(dataset_name, split=split)
        split_data.save_to_disk(target_folder)
        self.dataset_folder = target_folder
        return target_folder

    def load_dataset(self):
        """
        Loads the split that ``download_dataset`` saved to disk.

        Returns:
            data: The object returned by ``datasets.load_from_disk``.

        Raises:
            RuntimeError: If ``download_dataset`` has not been called yet.
        """
        if self.dataset_folder is None:
            raise RuntimeError(
                "No dataset folder is set; call download_dataset() before load_dataset()."
            )
        return load_from_disk(self.dataset_folder)

### Dataset Collecting Pipeline

#### Instantiate dataset generator

In [4]:
# Checkpoint whose hidden states are collected, and the device it runs on.
MODEL_NAME = "google/gemma-3-1b-it"
DEVICE = "cuda"

generator = ProbingDatasetGenerator(MODEL_NAME, DEVICE)

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

#### Download the SQuAD dataset (general questions)

In [5]:
# Hub repository and split that supply the questions.
SQUAD_REPO = "rajpurkar/squad"
SQUAD_SPLIT = "train"

# Download the split, save it locally, then read it back from disk.
dataset_loader = HuggingFaceDatasetLoader()
dataset_loader.download_dataset(SQUAD_REPO, split=SQUAD_SPLIT)
dataset = dataset_loader.load_dataset()

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/87599 [00:00<?, ? examples/s]

#### Dataset preparation

In [6]:
from google.colab import drive

# Mount Google Drive so that outputs written below survive the Colab session.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
import os

# Folder under the mounted Drive that later cells write their outputs into
# (the subset indices file and the activation folders).
drive_path = "/content/drive/MyDrive/probing"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(drive_path, exist_ok=True)

In [9]:
import random

# Size of the random subset and the seed that makes the draw repeatable
# across kernel restarts.
subset_size = 10_000
subset_seed = 42

# A dedicated RNG keeps the draw independent of any other use of the global
# `random` state. The sample size is capped at the dataset size because
# `sample` raises ValueError when asked for more items than are available.
subset_rng = random.Random(subset_seed)
random_indices = subset_rng.sample(
    range(len(dataset)),
    min(subset_size, len(dataset)),
)

# Select the random subset
random_subset = dataset.select(random_indices)

In [10]:
# Persist the sampled indices, one per line, so the subset can be rebuilt later.
with open(f"{drive_path}/subset_indices.txt", "w") as indices_file:
    indices_file.writelines(f"{idx}\n" for idx in random_indices)

#### Dataset collection

In [11]:
from tqdm import tqdm
import torch

# Number of questions per forward pass, and the label saved with every batch.
batch_size = 32
label = 0
activations_folder = f"{drive_path}/activations-{label}"
os.makedirs(activations_folder, exist_ok=True)

for i in tqdm(range(0, len(random_subset), batch_size)):
    # Slice the rows first and pick the column afterwards: selecting the
    # column first (`random_subset['question'][...]`) touches the whole
    # column on every iteration just to keep one batch of it.
    questions = random_subset[i:i + batch_size]["question"]
    activations = generator.get_activations(texts=questions)

    # One file per batch, numbered by batch index.
    generator.store_activations(
        activations,
        label,
        f"{activations_folder}/layers_activations_batch_{i//batch_size}.pt"
    )

100%|██████████| 313/313 [16:09<00:00,  3.10s/it]
