# Notebook 2: Embedding Generation using GPT-2

**Objective:** Load the pre-trained GPT-2 model and tokenizer to generate static embeddings for:
1. The 100 curated occupations (from the validated dictionary created in Notebook 1).
2. The standard gender anchor terms ('he', 'she', 'man', 'woman').

**Method:** We will use **Masked Mean Pooling** over the token embeddings from the last hidden layer of GPT-2 to obtain a single vector representation for each occupation/term.

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2Model
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter!
from pathlib import Path
import os

## 2. Configuration

In [2]:
# --- Paths ---
# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is assumed to be launched from the 'notebooks' folder).
current_dir = Path(os.getcwd())
project_root = current_dir.parent

In [3]:
# Validated occupation dictionary (produced by Notebook 1) that this notebook reads
INPUT_CSV_FILE = project_root.joinpath('results', 'occupation_dictionary_validated.csv')

In [4]:
# Directory holding results, and the archive the embeddings are written to
RESULTS_DIR = project_root.joinpath('results')
EMBEDDING_OUTPUT_FILE = RESULTS_DIR.joinpath('gpt2_static_embeddings.npz')

In [5]:
# --- Model Configuration ---
# Identifier passed to `from_pretrained` for both the tokenizer and the model
MODEL_NAME = 'gpt2'
# Gender anchor terms that are embedded alongside the occupations
GENDER_TERMS = [
    'he',
    'she',
    'man',
    'woman',
]

In [6]:
# Make sure the results directory (and any missing parents) exists; this is a
# no-op when Notebook 1 has already created it.
os.makedirs(RESULTS_DIR, exist_ok=True)

## 3. Setup Device

In [7]:
# Pick the compute device in order of preference: CUDA GPU, Apple-Silicon MPS, CPU.
# `torch.backends.mps` is looked up with getattr because PyTorch builds that
# predate the MPS backend do not expose that attribute; accessing it directly
# would raise AttributeError instead of falling back to the CPU.
_mps_backend = getattr(torch.backends, 'mps', None)

if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
    print("Using GPU:", torch.cuda.get_device_name(0))
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device('mps')  # For Apple Silicon GPUs
    print("Using MPS (Apple Silicon GPU)")
else:
    DEVICE = torch.device('cpu')
    print("Using CPU")

Using MPS (Apple Silicon GPU)


## 4. Load Tokenizer and Model

In [17]:
print(f"\nLoading {MODEL_NAME} tokenizer and model...")

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2Model.from_pretrained(MODEL_NAME)

# --- Handle Padding Token ---
# When the tokenizer reports no PAD token, the existing EOS token is reused for
# padding. No new token is added to the vocabulary by this assignment, so the
# model's embedding matrix is left as loaded.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    print("Tokenizer lacks padding token. Setting EOS token as PAD token.")
    tokenizer.pad_token = tokenizer.eos_token

# Move the model to the selected device and switch it to evaluation mode
# (disables dropout, etc.). Being the last expression of the cell, the model
# summary is displayed as the cell output.
model.to(DEVICE).eval()


Loading gpt2 tokenizer and model...
Tokenizer lacks padding token. Setting EOS token as PAD token.


GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

## 5. Define Embedding Function (Masked Mean Pooling)

In [18]:
def get_embedding(text, tokenizer, model, device):
    """
    Generates an embedding for the input text using masked mean pooling
    over the last hidden state of the provided transformer model.

    The text is tokenized as a single sequence, passed through the model
    without gradient tracking, and the last-layer token vectors are averaged
    over the positions where the attention mask is 1.

    Args:
        text (str): The input text (word or phrase).
        tokenizer: The loaded Hugging Face tokenizer.
        model: The loaded Hugging Face model.
        device: The torch device ('cuda', 'mps', or 'cpu').

    Returns:
        np.ndarray or None: The embedding vector as a 1-D NumPy array of
                             length hidden_size, or None if the input is not a
                             non-blank string, the result contains NaN, or any
                             exception is raised while computing it (the error
                             is printed, not propagated).
    """
    if not isinstance(text, str) or not text.strip():
        print(f"Warning: Invalid input text provided: '{text}'. Skipping.")
        return None

    try:
        # Tokenize the single input text and move the tensors to the target device
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,        # Pad to the longest sequence in the batch (or max_length if specified)
            truncation=True,     # Truncate sequences longer than model max length
            max_length=tokenizer.model_max_length,
            return_attention_mask=True
        ).to(device)

        # Forward pass only; no gradients are needed for embedding extraction
        with torch.no_grad():
            outputs = model(**inputs)

        # (batch_size, sequence_length, hidden_size)
        last_hidden_states = outputs.last_hidden_state

        # --- Masked Mean Pooling ---
        # (batch_size, sequence_length, 1); broadcasts over the hidden dimension
        mask = inputs['attention_mask'].unsqueeze(-1).float()
        # Zero out masked positions, then sum over the sequence dimension
        summed_embeddings = (last_hidden_states * mask).sum(dim=1)
        # Number of unmasked tokens per sequence. Counts are whole numbers, so
        # clamping to 1 only affects the all-masked case, where it prevents a
        # division by zero (the masked sum is 0 there, giving a zero vector).
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        mean_pooled_embedding = summed_embeddings / token_counts
        # --- End Masked Mean Pooling ---

        # Drop ONLY the batch dimension (batch size is 1 because `text` is a
        # single string). A bare squeeze() would also collapse any other axis
        # of size 1, e.g. a hidden dimension of 1.
        embedding_np = mean_pooled_embedding.squeeze(0).cpu().numpy()

        # Reject embeddings that contain NaN values
        if np.isnan(embedding_np).any():
            print(f"Warning: NaN detected in embedding for '{text}'. Skipping.")
            return None

        return embedding_np

    except Exception as e:
        # Best-effort by design: report the failure and let the caller decide
        # how to handle a missing embedding.
        print(f"Error getting embedding for '{text}': {e}")
        return None

In [19]:
# Smoke test: embed one sample word and report the resulting vector shape
test_word = "example"
test_embedding = get_embedding(test_word, tokenizer, model, DEVICE)
if test_embedding is None:
    print(f"\nFailed to generate test embedding for '{test_word}'. Check function/model.")
else:
    print(f"\nSuccessfully generated test embedding for '{test_word}' with shape: {test_embedding.shape}")


Successfully generated test embedding for 'example' with shape: (768,)


## 6. Load Occupations

In [20]:
print(f"\nLoading occupations from: {INPUT_CSV_FILE}")
try:
    df_occupations = pd.read_csv(INPUT_CSV_FILE)
    # Keep the distinct, non-null values of the 'occupation' column
    occupation_column = df_occupations['occupation']
    occupation_list = occupation_column.dropna().unique().tolist()
    print(f"Loaded {len(occupation_list)} unique occupations.")
    # Sanity check against the expected dictionary size
    if len(occupation_list) != 100:
        print(f"Warning: Expected 100 occupations based on paper, but found {len(occupation_list)} unique entries.")
except KeyError:
    # The CSV was read but has no 'occupation' column
    print(f"Error: Column 'occupation' not found in {INPUT_CSV_FILE}.")
    raise
except FileNotFoundError:
    # The validated dictionary has not been produced yet
    print(f"Error: Validated dictionary file not found at {INPUT_CSV_FILE}")
    print("Please ensure Notebook 1 was run successfully and the file exists.")
    raise
except Exception as e:
    # Anything else: report and propagate so the notebook stops here
    print(f"Error reading occupation data: {e}")
    raise


Loading occupations from: /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/occupation_dictionary_validated.csv
Loaded 100 unique occupations.


## 7. Generate Embeddings

In [24]:
# --- Embed Gender Terms ---
# Start fresh containers: `embeddings` maps term -> vector, `failed_terms`
# collects every term for which no embedding could be produced.
embeddings = {}
failed_terms = []
for term in tqdm(GENDER_TERMS, desc="Gender Terms"):
    emb = get_embedding(term, tokenizer, model, DEVICE)
    if emb is None:
        failed_terms.append(term)
        print(f"FATAL: Failed to get embedding for essential gender term '{term}'. Cannot proceed.")
        continue
    embeddings[term] = emb

# Every gender term is required; stop if any of them is missing
if any(anchor not in embeddings for anchor in GENDER_TERMS):
    raise ValueError("Failed to generate embeddings for one or more essential gender terms.")
print("Gender term embeddings generated successfully.")

Gender Terms:   0%|          | 0/4 [00:00<?, ?it/s]

Gender term embeddings generated successfully.


In [25]:
# --- Embed Occupations ---
print("\nEmbedding occupations...")
for occupation in tqdm(occupation_list, desc="Occupations"):
    # Embed the whitespace-stripped text, but store it under the original name
    occupation_clean = occupation.strip()
    if occupation_clean:
        emb = get_embedding(occupation_clean, tokenizer, model, DEVICE)
        if emb is None:
            failed_terms.append(occupation)
        else:
            embeddings[occupation] = emb
    else:
        # Nothing left after stripping: record it as a failure and move on
        failed_terms.append(f"(Empty Occupation: Original '{occupation}')")

print(f"\nEmbedding generation complete.")
print(f"Successfully generated embeddings for {len(embeddings)} terms.")
if len(failed_terms) > 0:
    print(f"Warning: Failed to generate embeddings for {len(failed_terms)} terms:")
    # Show at most the first 20 failures to keep the output short
    print(failed_terms[:20])


Embedding occupations...


Occupations:   0%|          | 0/100 [00:00<?, ?it/s]


Embedding generation complete.
Successfully generated embeddings for 104 terms.


## 8. Save Embeddings

In [26]:
# Write all embeddings into a single compressed .npz archive; the dictionary
# keys are passed as keyword arguments and name the arrays inside the archive.
try:
    np.savez_compressed(EMBEDDING_OUTPUT_FILE, **embeddings)
    print("Embeddings saved successfully.")
except Exception as e:
    print(f"Error saving embeddings: {e}")
    # Propagate the failure (same print-then-raise handling as the
    # occupation-loading cell): the archive is this notebook's deliverable, so
    # a failed save must not let the run finish as if it had succeeded.
    raise

Embeddings saved successfully.
