In [1]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2Model
import numpy as np
from tqdm import tqdm

# Choose the compute device up front: GPU when CUDA is available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the GPT-2 tokenizer and base model from the same checkpoint.
# output_hidden_states=True makes the forward pass return every layer's
# activations, which the embedding helper relies on.
model_name = "openai-community/gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2Model.from_pretrained(model_name, output_hidden_states=True)
model = model.to(device)
model.eval()  # inference mode: disables dropout

# Indices into the hidden-state tuple to pool: every third entry -> [3, 6, 9, 12]
target_layers = list(range(3, 13, 3))

# Helper function
def get_embedding(text, layers=None, max_length=512):
    """Embed ``text`` as concatenated, mean-pooled GPT-2 hidden states.

    The text is tokenized (truncated to ``max_length`` tokens), run through
    the module-level ``model`` without gradients, and for each requested
    layer the token vectors are averaged over the sequence dimension. The
    per-layer means are concatenated into a single 1-D vector.

    Args:
        text: The string to embed.
        layers: Indices into ``outputs.hidden_states`` to pool. Defaults to
            the module-level ``target_layers``.
        max_length: Maximum number of tokens kept by the tokenizer; anything
            beyond this is dropped, so long documents are represented by
            their opening tokens only.
            NOTE(review): GPT-2's positional table is presumably limited to
            1024 positions -- confirm before raising this above 1024.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``len(layers) * hidden_dim``.

    Raises:
        ValueError: If ``text`` tokenizes to zero tokens (e.g. an empty
            string); there is nothing to average in that case.
    """
    if layers is None:
        layers = target_layers

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    # Fail early with a clear message instead of letting an empty sequence
    # reach the model (obscure reshape error) or the mean (NaN vector).
    if inputs["input_ids"].numel() == 0:
        raise ValueError("get_embedding: text produced no tokens; cannot compute an embedding")

    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # Tuple with one tensor per layer, each [batch, seq_len, hidden_dim].
    hidden_states = outputs.hidden_states

    # [0] selects the single batch item; mean over dim 0 pools across tokens.
    pooled = [hidden_states[layer][0].mean(dim=0) for layer in layers]

    # Concatenate on-device so only one host transfer is needed.
    return torch.cat(pooled).cpu().numpy()  # [len(layers) * hidden_dim]

# Read both corpora from disk before running any model inference.
# Each file is expected to hold a JSON list of records with a "text" field.
with open("poetic_neuroscience.json", encoding="utf-8") as f:
    maria_poems = json.loads(f.read())

with open("shakespeare_gutenberg.json", encoding="utf-8") as f:
    shakespeare_plays = json.loads(f.read())

# Compute one embedding vector per record, with a progress bar per corpus
maria_embeddings = [
    get_embedding(poem["text"])
    for poem in tqdm(maria_poems, desc="Maria Poems")
]
shakespeare_embeddings = [
    get_embedding(play["text"])
    for play in tqdm(shakespeare_plays, desc="Shakespeare Plays")
]

# Persist each corpus as a 2-D array: one row per record
np.save("maria_gpt2.npy", np.stack(maria_embeddings, axis=0))
np.save("shakespeare_gpt2.npy", np.stack(shakespeare_embeddings, axis=0))

print("Embeddings saved as maria_gpt2.npy and shakespeare_gpt2.npy")


  from .autonotebook import tqdm as notebook_tqdm
Maria Poems: 100%|██████████| 154/154 [00:02<00:00, 75.56it/s] 
Shakespeare Plays: 100%|██████████| 13/13 [00:06<00:00,  2.11it/s]

Embeddings saved as maria_gpt2.npy and shakespeare_gpt2.npy





In [4]:
# Sanity check: 4 pooled layers concatenated into one vector per poem
print(np.shape(maria_embeddings[0]))

(3072,)
