# Notebook 6: Data Preparation for Semantic Clustering

**Objective:** Prepare the clustering dataset (26 occupations from Bias in Bios) and generate *contextualized* GPT-2 embeddings for each occupation from its representative sentences. This involves:
1. Loading the list of target occupations and their stereotype labels for clustering.
2. Loading the dictionary of sampled sentences for these occupations.
3. Filtering out excluded occupations ('psychologist', 'surgeon').
4. Loading the GPT-2 model and tokenizer.
5. Defining a function to generate contextualized embeddings by averaging sentence embeddings.
6. Generating and saving the embeddings and the corresponding metadata (occupation list + labels).

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2Model
from tqdm.notebook import tqdm
from pathlib import Path
import os
import pickle

## 2. Configuration

In [2]:
# --- Paths ---
# The kernel is expected to be started from the 'notebooks' directory,
# so the project root is one level above the working directory.
current_dir = Path.cwd()
project_root = current_dir.parent

In [12]:
# Input files
# CSV with one row per clustering occupation and its BLS stereotype label
CLUSTER_OCCUPATIONS_CSV = project_root.joinpath('data', 'bib_bias_cluster_analysis.csv')
# Pickled dict of the form {occupation_name: [sentence1, sentence2, ...]}
SENTENCES_PICKLE_FILE = project_root.joinpath('data', 'processed', 'bib_sampled_sentences.pkl')

In [13]:
# Output files
RESULTS_DIR = project_root.joinpath('results')
# Everything produced for the clustering analysis lives in its own subdirectory
CLUSTER_RESULTS_DIR = RESULTS_DIR.joinpath('semantic_clustering')
CLUSTER_METADATA_OUTPUT_CSV = CLUSTER_RESULTS_DIR.joinpath('clustering_metadata.csv')
CLUSTER_EMBEDDINGS_OUTPUT_NPZ = CLUSTER_RESULTS_DIR.joinpath('clustering_contextual_embeddings.npz')

In [14]:
# --- Model Configuration ---
# Hugging Face identifier of the model used to embed the sentences
MODEL_NAME = 'gpt2'
# Occupations the paper leaves out of the clustering analysis
EXCLUDED_OCCUPATIONS = [
    'psychologist',
    'surgeon',
]

In [15]:
# Make sure the clustering output directory (and any missing parents) exists;
# an already existing directory is not an error.
CLUSTER_RESULTS_DIR.mkdir(exist_ok=True, parents=True)

## 3. Load Sentences and Occupation Metadata

In [16]:
# Load sentences
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by this project's own sentence-extraction step.
try:
    with open(SENTENCES_PICKLE_FILE, 'rb') as f:
        sentences_dict = pickle.load(f)
except FileNotFoundError:
    print(f"Error: Sentences file not found at {SENTENCES_PICKLE_FILE}")
    print("Please ensure the sentence extraction step was completed.")
    raise
except Exception as e:
    print(f"Error loading sentences pickle: {e}")
    raise

# Later cells call .keys() and index by occupation name, so fail early with a
# clear message if the pickle does not hold the expected mapping.
if not isinstance(sentences_dict, dict):
    raise TypeError(
        f"Expected a dict of {{occupation: [sentences]}} in {SENTENCES_PICKLE_FILE}, "
        f"got {type(sentences_dict).__name__}."
    )
print(f"Loaded sentences for {len(sentences_dict)} occupations from {SENTENCES_PICKLE_FILE}")

Loaded sentences for 28 occupations from /Users/jessie/Documents/Projects/master_thesis_llms_bias/data/processed/bib_sampled_sentences.pkl


In [17]:
# Load occupation list and stereotypes for clustering
df_cluster_occupations = pd.read_csv(CLUSTER_OCCUPATIONS_CSV)
print(f"Loaded metadata for {len(df_cluster_occupations)} occupations from {CLUSTER_OCCUPATIONS_CSV}")

# Validate required columns: collect whatever is absent and fail if anything is
required_cols = ['occupation', 'bls_label'] # Expecting these based on analysis of provided scripts
missing = [col for col in required_cols if col not in df_cluster_occupations.columns]
if missing:
    raise ValueError(f"Missing required columns in {CLUSTER_OCCUPATIONS_CSV}: {missing}. Available: {df_cluster_occupations.columns.tolist()}")

# Clean occupation names (surrounding whitespace would break the lookups
# against the sentences dictionary further down)
df_cluster_occupations['occupation'] = df_cluster_occupations['occupation'].str.strip()

Loaded metadata for 28 occupations from /Users/jessie/Documents/Projects/master_thesis_llms_bias/data/bib_bias_cluster_analysis.csv


## 4. Filter Occupations

In [19]:
# Filter based on EXCLUDED_OCCUPATIONS list: keep every row whose occupation
# is not explicitly excluded, as an independent copy of the source frame.
is_excluded = df_cluster_occupations['occupation'].isin(EXCLUDED_OCCUPATIONS)
df_filtered_occupations = df_cluster_occupations[~is_excluded].copy()

In [20]:
# Filter based on availability in sentences dictionary: an occupation without
# sampled sentences cannot be embedded, so it is dropped here.
original_count = len(df_filtered_occupations)
has_sentences = df_filtered_occupations['occupation'].isin(sentences_dict.keys())
df_final_occupations = df_filtered_occupations[has_sentences].reset_index(drop=True)
final_count = len(df_final_occupations)
final_count = len(df_final_occupations)

In [21]:
# Report how many rows the exclusion list actually removed. The length of
# EXCLUDED_OCCUPATIONS would overstate this whenever a listed occupation is
# not present in the metadata CSV.
excluded_count = len(df_cluster_occupations) - len(df_filtered_occupations)
print(f"Excluded {excluded_count} specific occupations.")
if final_count < original_count:
    print(f"Excluded {original_count - final_count} additional occupations not found in the sentences dictionary.")

Excluded 2 specific occupations.


In [22]:
# Verify the final count (expected to be 26 based on paper); a mismatch is
# only reported as a warning and does not stop the notebook.
expected_count = 26
print(f"Final number of occupations for clustering: {final_count}")
count_matches = final_count == expected_count
if not count_matches:
    print(f"Warning: Expected {expected_count} occupations after filtering, but got {final_count}.")

Final number of occupations for clustering: 26


In [23]:
# Keep only the sentences for the final list of occupations, in the same
# order as the metadata dataframe.
final_occupation_list = df_final_occupations['occupation'].tolist()
final_sentences_dict = {}
for occ in final_occupation_list:
    if occ in sentences_dict:
        final_sentences_dict[occ] = sentences_dict[occ]

## 5. Setup Model and Device

In [24]:
# Setup Device
# Preference order: CUDA GPU, then Apple-Silicon MPS, then CPU.
# torch.backends.mps is looked up defensively because PyTorch builds that
# do not ship the MPS backend do not expose that attribute at all.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
    print("\nUsing GPU:", torch.cuda.get_device_name(0))
elif _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device('mps')
    print("\nUsing MPS (Apple Silicon GPU)")
else:
    DEVICE = torch.device('cpu')
    print("\nUsing CPU")


Using MPS (Apple Silicon GPU)


In [25]:
# Load Tokenizer and Model (same setup as Notebook 2)
print(f"Loading {MODEL_NAME} tokenizer and model...")
try:
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    # GPT-2 ships without a padding token; reuse EOS so padding=True works
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = GPT2Model.from_pretrained(MODEL_NAME)
    model.to(DEVICE)
    # Inference only: switch off dropout and other training-time behaviour
    model.eval()
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    raise
else:
    print("Model and tokenizer loaded successfully.")

Loading gpt2 tokenizer and model...
Model and tokenizer loaded successfully.


## 6. Define Contextual Embedding Function

In [26]:
def get_contextual_embedding(occupation_name, sentences, tokenizer, model, device):
    """
    Generates a contextualized embedding for an occupation by averaging
    the embeddings of its representative sentences. Each sentence embedding
    is obtained via masked mean pooling over its tokens.

    Sentences that are not non-empty strings, that raise during processing,
    or whose pooled embedding contains a non-finite value (NaN or +/-inf)
    are skipped, so a single bad sentence cannot corrupt the occupation mean.

    Args:
        occupation_name (str): Name of the occupation (for logging).
        sentences (list[str]): List of sentences for the occupation.
        tokenizer: The loaded Hugging Face tokenizer.
        model: The loaded Hugging Face model.
        device: The torch device ('cuda', 'mps', or 'cpu').

    Returns:
        np.ndarray or None: The averaged embedding vector as a NumPy array,
                             or None if an error occurs or no valid sentence embeddings generated.
    """
    if not sentences:
        print(f"Warning: No sentences provided for occupation '{occupation_name}'. Skipping.")
        return None

    sentence_embeddings = []
    for sentence in sentences:
        # Ignore entries that are not usable text
        if not isinstance(sentence, str) or not sentence.strip():
            continue
        try:
            # Tokenize input (one sentence per call, so the batch size is 1)
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=tokenizer.model_max_length, # Use model's max length
                return_attention_mask=True
            ).to(device)

            # Get model output without tracking gradients
            with torch.no_grad():
                outputs = model(**inputs)
            last_hidden_states = outputs.last_hidden_state

            # Masked mean pooling: zero out masked positions, then divide the
            # per-dimension sum by the number of attended tokens.
            attention_mask = inputs['attention_mask']
            mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
            summed_embeddings = torch.sum(last_hidden_states * mask_expanded, 1)
            # Clamp guards against division by zero for an all-masked input
            token_counts = torch.clamp(mask_expanded.sum(1), min=1e-9)
            mean_pooled_embedding = summed_embeddings / token_counts

            # Drop only the batch axis; a bare squeeze() would also collapse
            # any other size-1 axis.
            embedding_np = mean_pooled_embedding.squeeze(0).cpu().numpy()
        except Exception as e:
            print(f"Error processing sentence for '{occupation_name}': {e}. Sentence: '{sentence[:100]}...'")
            continue # Skip this sentence, proceed to next

        # Keep only fully finite vectors: NaN *and* inf would poison the mean
        if np.isfinite(embedding_np).all():
            sentence_embeddings.append(embedding_np)

    # Average the embeddings for the occupation
    if not sentence_embeddings:
        print(f"Warning: No valid sentence embeddings generated for occupation '{occupation_name}'. Returning None.")
        return None
    return np.mean(sentence_embeddings, axis=0)

## 7. Generate Contextual Embeddings

In [27]:
# Accumulators for the embedding loop:
#   contextual_embeddings -> {occupation: averaged embedding vector}
#   failed_occupations    -> occupations for which no embedding was produced
contextual_embeddings, failed_occupations = {}, []

In [28]:
# Embed every occupation; record the ones that yield no embedding so they can
# be removed from the metadata afterwards.
for occupation in tqdm(final_occupation_list, desc="Generating Embeddings"):
    occupation_sentences = final_sentences_dict.get(occupation, [])
    avg_emb = get_contextual_embedding(occupation, occupation_sentences, tokenizer, model, DEVICE)

    if avg_emb is None:
        failed_occupations.append(occupation)
        continue
    contextual_embeddings[occupation] = avg_emb

Generating Embeddings:   0%|          | 0/26 [00:00<?, ?it/s]

In [29]:
# Summarise the embedding run; failures are listed by name when there are any
print(f"\nContextual embedding generation complete.")
print(f"Successfully generated embeddings for {len(contextual_embeddings)} occupations.")
if len(failed_occupations) > 0:
    print(f"Warning: Failed to generate embeddings for {len(failed_occupations)} occupations:")
    print(failed_occupations)


Contextual embedding generation complete.
Successfully generated embeddings for 26 occupations.


In [30]:
# Remove failed occupations from the final dataframe if any failed, so the
# saved metadata lists only occupations that have an embedding.
if failed_occupations:
    print("Removing failed occupations from metadata dataframe.")
    embedding_failed = df_final_occupations['occupation'].isin(failed_occupations)
    df_final_occupations = df_final_occupations[~embedding_failed].reset_index(drop=True)
    print(f"Metadata dataframe now contains {len(df_final_occupations)} occupations.")

## 8. Save Metadata and Embeddings

In [31]:
print(f"\nSaving clustering metadata to: {CLUSTER_METADATA_OUTPUT_CSV}")
try:
    # Save only the metadata for occupations we successfully embedded
    df_final_occupations.to_csv(CLUSTER_METADATA_OUTPUT_CSV, index=False, encoding='utf-8')
    print("Metadata saved successfully.")
except Exception as e:
    print(f"Error saving metadata: {e}")
    # Re-raise, as the loading cells do: a swallowed failure here would leave
    # a missing or stale metadata file while the notebook appears to succeed.
    raise


Saving clustering metadata to: /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/semantic_clustering/clustering_metadata.csv
Metadata saved successfully.


In [32]:
print(f"\nSaving contextual embeddings to: {CLUSTER_EMBEDDINGS_OUTPUT_NPZ}")
try:
    # Ensure embeddings dict only contains successfully processed occupations.
    # The membership set is built once instead of once per dictionary item.
    occupations_in_metadata = set(df_final_occupations['occupation'])
    final_embeddings_to_save = {
        occ: emb
        for occ, emb in contextual_embeddings.items()
        if occ in occupations_in_metadata
    }
    # Each occupation name becomes the key of one array inside the archive
    np.savez_compressed(CLUSTER_EMBEDDINGS_OUTPUT_NPZ, **final_embeddings_to_save)
    print("Embeddings saved successfully.")
    print(f"Saved embeddings for {len(final_embeddings_to_save)} occupations.")
except Exception as e:
    print(f"Error saving embeddings: {e}")
    # Re-raise, as the loading cells do: a swallowed failure here would leave
    # a missing or stale embeddings file while the notebook appears to succeed.
    raise


Saving contextual embeddings to: /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/semantic_clustering/clustering_contextual_embeddings.npz
Embeddings saved successfully.
Saved embeddings for 26 occupations.
