In [1]:
!pip install datasets -q


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: pip install --upgrade pip


## Extract last-token hidden states from layers 8, 16 and 24

In [2]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import drive, userdata
from tqdm import tqdm
import h5py  # For storing data in HDF5

# Extract last-token hidden states from selected layers of Llama-3.2-3B-Instruct
# for sentences of the spatial_geometry dataset and store them in an HDF5 file.
# NOTE: this cell is Colab-specific (google.colab secrets + Google Drive paths).

# ---- Configuration ----
# Hugging Face access token, read from the Colab secret named 'HF_Meta'.
hf_token = userdata.get('HF_Meta')
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Indices into `outputs.hidden_states` that are exported. The tuple has
# number_of_layers + 1 entries, each of shape (batch_size, seq_len, hidden_dim).
desired_layers = [8, 16, 24]

# Name of the column holding the sentences. The header is itself a sentence
# (see dataset["train"].column_names) -- presumably the source file has no
# header row; TODO confirm with the dataset author.
text_column = 'The table is above the chair.'

# Number of rows to embed. 6 reproduces the original quick-test run
# (the loop used to break once idx >= 5); set to None for the full train split.
max_rows = 6

save_path = "/content/drive/MyDrive/Llama-3.2-3B-Instruct_layer_embeddings.h5"

# The output path lives on Google Drive, which only exists under /content/drive
# once it has been mounted. Mount before the expensive extraction so a Drive
# problem surfaces early.
drive.mount("/content/drive")

# ---- Load the dataset ----
dataset = load_dataset("matthieunlp/spatial_geometry")
train_split = dataset["train"]
rows_to_process = len(train_split) if max_rows is None else min(max_rows, len(train_split))

# ---- Load model & tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    output_hidden_states=True,
    device_map="auto"
)
model.eval()

# ---- Prepare storage ----
sentences = []
layer8_list = []
layer16_list = []
layer24_list = []
# Maps each exported layer index to the list collecting its vectors.
layer_buckets = dict(zip(desired_layers, (layer8_list, layer16_list, layer24_list)))

# ---- Iterate over the dataset with a tqdm progress bar ----
print("Extracting embeddings...")
for entry in tqdm(train_split.select(range(rows_to_process)), desc="Rows"):
    sentence_text = entry[text_column]

    # Tokenize a single sentence (batch size 1, so no padding is involved)
    inputs = tokenizer(sentence_text, return_tensors="pt", truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Inference without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    hidden_states = outputs.hidden_states

    sentences.append(sentence_text)
    for layer_idx, bucket in layer_buckets.items():
        # [0, -1, :] selects the last token of the only batch element -> (hidden_dim,).
        # .float() keeps the NumPy conversion working if the model is ever loaded
        # in a half-precision dtype; it is a no-op for float32.
        bucket.append(hidden_states[layer_idx][0, -1, :].float().cpu().numpy())

if not sentences:
    raise ValueError("No sentences were processed; nothing to save.")

# Convert lists to NumPy arrays [num_sentences, hidden_dim]
layer8_array = np.stack(layer8_list, axis=0)
layer16_array = np.stack(layer16_list, axis=0)
layer24_array = np.stack(layer24_list, axis=0)

# ---- Save ----
with h5py.File(save_path, "w") as f:
    # Variable-length string dataset for the sentences
    string_dt = h5py.special_dtype(vlen=str)
    dset_sentences = f.create_dataset("sentences", shape=(len(sentences),), dtype=string_dt)
    dset_sentences[:] = sentences

    # One gzip-compressed dataset per exported layer: layer_8, layer_16, layer_24
    for layer_idx, layer_array in zip(desired_layers, (layer8_array, layer16_array, layer24_array)):
        f.create_dataset(f"layer_{layer_idx}", data=layer_array, compression="gzip")

print(f"Embeddings saved to: {save_path}")

# ---- Summary statistics ----
print("\n==== SUMMARY STATISTICS ====")
print(f"Total dataset rows available: {len(train_split)}")
# Report the rows actually embedded, not the size of the whole split.
print(f"Total dataset rows processed: {rows_to_process}")
print(f"Total sentences collected: {len(sentences)}")
print(f"Layers extracted: {desired_layers}")
print(f"Shape of each embedding array: {layer8_array.shape} (should match sentences)")

ModuleNotFoundError: No module named 'google.colab'

In [6]:
# Path of the HDF5 embeddings file to read back; this is the same location the
# extraction cell writes to via `save_path`.
hdf5_path = "/content/drive/MyDrive/Llama-3.2-3B-Instruct_layer_embeddings.h5"

In [7]:
import h5py

# Read the stored sentences and per-layer embedding matrices back into memory.
with h5py.File(hdf5_path, "r") as f:
    # h5py >= 3 returns variable-length strings as bytes objects; .asstr()
    # decodes them so the sentences come back as the str values that were written.
    sentences_stored = f["sentences"].asstr()[:]

    # Load the embeddings, each of shape (num_sentences, hidden_dim)
    layer8_stored = f["layer_8"][:]
    layer16_stored = f["layer_16"][:]
    layer24_stored = f["layer_24"][:]

# Every embedding matrix must have exactly one row per stored sentence.
assert (
    len(sentences_stored)
    == layer8_stored.shape[0]
    == layer16_stored.shape[0]
    == layer24_stored.shape[0]
), "sentence count and embedding row counts disagree"


In [12]:
from datasets import load_dataset

# Reload the public dataset and print its structure for inspection.
# The deprecated `use_auth_token=None` argument was dropped: passing None
# supplies no credentials, so it had no effect on the load.
dataset = load_dataset(
    "matthieunlp/spatial_geometry",
    verification_mode='no_checks'  # skip the library's post-download verification checks
)

train_split = dataset["train"]
# Never request more preview rows than the split contains.
preview_count = min(10, len(train_split))

# More detailed inspection
print("\nDetailed dataset inspection:")
print("1. Dataset keys:", dataset.keys())
print("\n2. Train split column names:", train_split.column_names)
print("\n3. First 10 rows:")
for i, row in enumerate(train_split.select(range(preview_count))):
    print(f"Row {i}:", row)
print("\n4. Dataset info:")
print(train_split.info)

Generating train split:   0%|          | 0/142099 [00:00<?, ? examples/s]


Detailed dataset inspection:
1. Dataset keys: dict_keys(['train'])

2. Train split column names: ['The table is above the chair.']

3. First 10 rows:
Row 0: {'The table is above the chair.': 'The table is over the chair.'}
Row 1: {'The table is above the chair.': 'The table is on top of the chair.'}
Row 2: {'The table is above the chair.': 'The table is higher than the chair.'}
Row 3: {'The table is above the chair.': 'The table is elevated above the chair.'}
Row 4: {'The table is above the chair.': 'The table is to the left of the chair.'}
Row 5: {'The table is above the chair.': 'The table is in front of the chair.'}
Row 6: {'The table is above the chair.': 'The table is ahead of the chair.'}
Row 7: {'The table is above the chair.': 'The table is before the chair.'}
Row 8: {'The table is above the chair.': 'The table is inside the chair.'}
Row 9: {'The table is above the chair.': 'The table is within the chair.'}

4. Dataset info:
DatasetInfo(description='', citation='', homepage=''

In [None]:
# Print the module tree of `model`. The name is not defined in this cell, so the
# cell that creates it must have been run first in the same kernel.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

## end