# Notebook 7: Semantic Clustering Analysis and Visualization

**Objective:** Perform unsupervised clustering (KMeans, k=2) on the contextualized occupation embeddings and visualize the results using Principal Component Analysis (PCA). This involves:
1. Loading the contextual embeddings and corresponding metadata (occupation names, stereotype labels) generated in Notebook 6.
2. Performing KMeans clustering to partition the occupations into two groups.
3. Applying PCA to reduce the dimensionality of the embeddings to 2D for visualization.
4. Creating a scatter plot of the PCA results, coloring points by gender stereotype and marking cluster centroids (similar to Figure 7).
5. Analyzing the composition of the resulting clusters.

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score # Optional: To evaluate clustering
from pathlib import Path
import os

## 2. Configuration

In [2]:
# --- Paths ---
# Get project root assuming the notebook is in 'notebooks' directory
# NOTE: this relies on the kernel's working directory, not on the notebook's
# location on disk; if the kernel is started from another directory,
# project_root (the parent of the working directory) will point elsewhere.
current_dir = Path.cwd()
project_root = current_dir.parent

In [3]:
# Inputs produced by Notebook 6: everything lives under <project_root>/results/semantic_clustering
CLUSTER_RESULTS_DIR = project_root.joinpath('results', 'semantic_clustering')
# Per-occupation metadata table (occupation names and stereotype labels)
CLUSTER_METADATA_CSV = CLUSTER_RESULTS_DIR.joinpath('clustering_metadata.csv')
# Archive of contextual embeddings, one entry per occupation
CLUSTER_EMBEDDINGS_NPZ = CLUSTER_RESULTS_DIR.joinpath('clustering_contextual_embeddings.npz')

In [4]:
# Outputs written by this notebook (same directory as the inputs)
# Figure: PCA scatter plot
PCA_PLOT_OUTPUT_PNG = CLUSTER_RESULTS_DIR.joinpath('clustering_pca_plot_figure7.png')
# Table: metadata plus cluster assignment and PCA coordinates
CLUSTERING_RESULTS_CSV = CLUSTER_RESULTS_DIR.joinpath('clustering_analysis_results.csv')

In [5]:
# --- Clustering Parameters ---
# Number of KMeans clusters (the notebook objective is a two-group partition, k=2).
NUM_CLUSTERS = 2
# Seed passed as random_state to both KMeans and PCA below.
RANDOM_SEED = 42 # For reproducibility of KMeans

In [6]:
# --- Plotting Parameters ---
# Hex color per stereotype label (after the Figure 7 legend in the paper).
# Keys have to match the label strings stored in the metadata CSV; extend the
# list if the metadata contains additional categories.
STEREOTYPE_COLORS = dict([
    ('male-stereotyped', '#1f77b4'),          # blue
    ('male-stereotyped (proxy)', '#aec7e8'),  # lighter blue
    ('neutral', '#7f7f7f'),                   # gray
    ('neutral (proxy)', '#bdbdbd'),           # lighter gray
    ('female-stereotyped', '#d62728'),        # red
])

# Marker shape, color and base size used for cluster centroids
CENTROID_MARKER, CENTROID_COLOR, CENTROID_SIZE = 'X', 'black', 200

In [7]:
# Create results directory if it doesn't exist
# parents=True also creates any missing intermediate directories;
# exist_ok=True makes re-running this cell a no-op when the directory is already there.
CLUSTER_RESULTS_DIR.mkdir(parents=True, exist_ok=True)

## 3. Load Data

In [8]:
# Read the occupation metadata table and confirm the columns used downstream are present.
# Any failure is reported with a short hint and then re-raised so the run stops here.
required_meta_columns = ('occupation', 'bls_label')
try:
    df_meta = pd.read_csv(CLUSTER_METADATA_CSV)
    print(f"Loaded metadata for {len(df_meta)} occupations.")
    if not all(column in df_meta.columns for column in required_meta_columns):
        raise ValueError("Metadata CSV must contain 'occupation' and 'bls_label' columns.")
except FileNotFoundError:
    print(f"Error: Metadata file not found at {CLUSTER_METADATA_CSV}")
    print("Please ensure Notebook 6 ran successfully.")
    raise
except Exception as load_error:
    print(f"Error loading metadata CSV: {load_error}")
    raise

Loaded metadata for 26 occupations.


In [9]:
# Load embeddings into a plain dictionary {archive key: embedding array}.
# The archive keys are looked up by occupation name in the alignment step below.
try:
    # np.load on an .npz returns a lazy NpzFile that keeps the file open; the
    # context manager closes it once every array has been read into memory.
    # NOTE(review): allow_pickle=True lets the archive execute arbitrary code when
    # it contains pickled objects. It is kept so archives written with object
    # arrays still load; switch to allow_pickle=False if Notebook 6 stores plain
    # numeric arrays, and never point this at an untrusted file.
    with np.load(CLUSTER_EMBEDDINGS_NPZ, allow_pickle=True) as embeddings_data:
        embeddings_dict = {key: embeddings_data[key] for key in embeddings_data.files}
    print(f"Loaded {len(embeddings_dict)} embeddings.")
except FileNotFoundError:
    print(f"Error: Embeddings file not found at {CLUSTER_EMBEDDINGS_NPZ}")
    print("Please ensure Notebook 6 ran successfully.")
    raise
except Exception as e:
    print(f"Error loading embeddings file: {e}")
    raise

Loaded 26 embeddings.


In [10]:
# --- Align Embeddings with Metadata ---
# The embedding matrix has to follow the row order of df_meta, so capture that
# order first and start from empty accumulators.
occupation_order = df_meta['occupation'].to_list()  # row order of the DataFrame
ordered_embeddings, missing_embeddings = [], []     # aligned vectors / names without a vector

In [11]:
# Build the aligned list of embeddings, one entry per metadata row.
# The accumulators are reset here so that re-running this cell on its own does
# not append a second copy of every vector.
ordered_embeddings = []
missing_embeddings = []

# Any loaded embedding serves as the shape template for zero placeholders;
# it is looked up once instead of on every missing occupation.
reference_embedding = next(iter(embeddings_dict.values()), None)

for occ in occupation_order:
    embedding = embeddings_dict.get(occ)
    if embedding is None:
        if reference_embedding is None:
            # Nothing was loaded, so there is no shape to build a placeholder from.
            raise ValueError(
                "No embeddings were loaded; cannot align embeddings with metadata."
            )
        missing_embeddings.append(occ)
        # Placeholder zero vector keeps the matrix row-aligned with df_meta.
        embedding = np.zeros(reference_embedding.shape)
    ordered_embeddings.append(embedding)

In [12]:
# Report occupations that had no embedding and therefore received a zero placeholder.
missing_count = len(missing_embeddings)
if missing_count > 0:
    print(f"Warning: Embeddings not found for {missing_count} occupations listed in metadata:")
    print(missing_embeddings)
    print("These occupations will likely have zero vectors and might affect clustering/PCA.")
    # Alternative (not enabled): drop the affected rows from df_meta, e.g.
    # df_meta = df_meta[~df_meta['occupation'].isin(missing_embeddings)].reset_index(drop=True)
    # and rebuild the ordered embeddings from the filtered frame.
    # The notebook currently keeps the zero vectors instead.

In [13]:
# Combine the aligned per-occupation vectors into a single array; its first axis
# follows the row order of df_meta (checked in the next cell).
embedding_matrix = np.array(ordered_embeddings)

In [14]:
# Sanity check: the matrix needs exactly one row per metadata row
n_meta_rows = len(df_meta)
n_embedding_rows = embedding_matrix.shape[0]
if n_embedding_rows != n_meta_rows:
    raise ValueError(f"Mismatch in number of metadata rows ({n_meta_rows}) and embeddings ({n_embedding_rows}) after alignment.")
print(f"Aligned embedding matrix shape: {embedding_matrix.shape}")

Aligned embedding matrix shape: (26, 768)


## 4. Perform KMeans Clustering

In [15]:
# Fit KMeans on the aligned embeddings; n_init is pinned to 10 restarts
# (newer scikit-learn versions offer n_init='auto').
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_SEED, n_init=10)
try:
    cluster_labels = kmeans.fit_predict(embedding_matrix)
    centroids = kmeans.cluster_centers_

    # Store each occupation's cluster id alongside its metadata
    df_meta['cluster'] = cluster_labels
    print(f"Assigned occupations to {NUM_CLUSTERS} clusters.")

    # Optional quality measure: silhouette score of the partition
    try:
        silhouette_avg = silhouette_score(embedding_matrix, cluster_labels)
    except ValueError:
        # Raised when the labels do not form a valid partition for the score
        # (e.g. a single cluster or too few samples)
        print("Could not calculate Silhouette Score (possibly only one cluster assigned or too few samples).")
    else:
        print(f"Silhouette Score: {silhouette_avg:.4f} (higher is generally better, closer to 1)")

except Exception as clustering_error:
    print(f"Error during KMeans clustering: {clustering_error}")
    raise

Assigned occupations to 2 clusters.
Silhouette Score: 0.3661 (higher is generally better, closer to 1)


## 5. Perform PCA

In [16]:
# Project the embeddings onto their first two principal components
pca = PCA(n_components=2, random_state=RANDOM_SEED)
try:
    pca_results = pca.fit_transform(embedding_matrix)
    # Map the KMeans centroids into the same 2D space
    centroid_pca = pca.transform(centroids)

    # Keep the 2D coordinates next to the metadata
    first_component, second_component = pca_results[:, 0], pca_results[:, 1]
    df_meta['PC1'] = first_component
    df_meta['PC2'] = second_component
    print("PCA transformation complete.")

    # Share of variance captured by each component and by both together
    explained_variance = pca.explained_variance_ratio_
    pc1_ratio = explained_variance[0]
    pc2_ratio = explained_variance[1]
    print(f"Explained Variance Ratio - PC1: {pc1_ratio:.4f}, PC2: {pc2_ratio:.4f}")
    print(f"Total Explained Variance (2 components): {sum(explained_variance):.4f}")

except Exception as pca_error:
    print(f"Error during PCA: {pca_error}")
    raise

PCA transformation complete.
Explained Variance Ratio - PC1: 0.6978, PC2: 0.0981
Total Explained Variance (2 components): 0.7959


## 6. Save Clustering and PCA Results

In [17]:
# Write metadata + cluster ids + PCA coordinates to CSV.
# A failed write is reported but does not stop the notebook.
print(f"\nSaving combined results (metadata, cluster, PCA) to {CLUSTERING_RESULTS_CSV}...")
try:
    df_meta.to_csv(CLUSTERING_RESULTS_CSV, index=False, encoding='utf-8')
except Exception as save_error:
    print(f"Error saving combined results: {save_error}")
else:
    print("Combined results saved successfully.")


Saving combined results (metadata, cluster, PCA) to /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/semantic_clustering/clustering_analysis_results.csv...
Combined results saved successfully.


## 7. Generate PCA Scatter Plot (Figure 7)

In [20]:
# Draw the PCA scatter plot (Figure 7): one point per occupation colored by
# stereotype label, occupation names as text labels, cluster centroids as large
# markers with a dashed circle around each, then save the figure as a PNG.
# Plotting is best-effort: an error is reported and the figure closed, but the
# notebook keeps running.

print("\nGenerating PCA scatter plot (replicating Figure 7)...")

if df_meta.empty or 'PC1' not in df_meta.columns or 'PC2' not in df_meta.columns:
    print("Skipping plot generation: Missing data or PCA coordinates.")
else:
    # --- Prepare Data & Colors ---
    # Strip surrounding whitespace so labels match the palette keys exactly.
    # This updates df_meta in place, so later cells see the cleaned labels too.
    df_meta['bls_label'] = df_meta['bls_label'].str.strip()

    # Colors matching the target plot's legend
    STEREOTYPE_COLORS_FIG7 = {
        'neutral': 'grey',
        'male-stereotyped': 'tab:blue',
        'neutral (proxy)': 'darkgrey',                # slightly different grey for proxy
        'female-stereotyped': 'tab:red',
        'male-stereotyped (proxy)': 'lightsteelblue', # lighter blue for proxy
    }
    # seaborn raises if a hue level is missing from a dict palette, which would
    # abort the whole figure; labels without a defined color get a fallback instead.
    FALLBACK_LABEL_COLOR = 'tab:green'
    label_palette = dict(STEREOTYPE_COLORS_FIG7)
    unmapped_labels = [
        label for label in df_meta['bls_label'].dropna().unique()
        if label not in label_palette
    ]
    if unmapped_labels:
        print(f"Warning: No color defined for labels {unmapped_labels}; using '{FALLBACK_LABEL_COLOR}'.")
        label_palette.update(dict.fromkeys(unmapped_labels, FALLBACK_LABEL_COLOR))

    # Centroid face colors (orange for cluster 0, blue for cluster 1); cycled
    # when NUM_CLUSTERS exceeds the number of colors listed here.
    CENTROID_COLORS = ['#ff7f0e', '#1f77b4']
    # Radius (in data units) of the dashed circle drawn around each centroid; tune as needed.
    CENTROID_CIRCLE_RADIUS = 1.5

    # Centroids are only drawn when the PCA cell produced coordinates of the expected shape.
    centroids_available = 'centroid_pca' in locals() and centroid_pca.shape == (NUM_CLUSTERS, 2)

    # --- Create Plot ---
    try:
        sns.set_style("whitegrid")
        fig, ax = plt.subplots(figsize=(16, 14))  # large canvas, similar to the reference figure

        # --- Plot Data Points by Stereotype ---
        sns.scatterplot(
            data=df_meta,
            x='PC1',
            y='PC2',
            hue='bls_label',        # color points by stereotype label
            palette=label_palette,
            s=180,
            alpha=0.8,
            edgecolor='black',      # thin black outline for definition
            linewidth=0.5,
            legend='full',          # make sure every hue level gets a legend entry
            ax=ax,
        )

        # --- Add Occupation Labels ---
        # Iterate rows positionally so this works for any DataFrame index
        # (e.g. after filtering rows without resetting the index).
        label_rows = df_meta[['PC1', 'PC2', 'occupation']].itertuples(index=False, name=None)
        for pc1, pc2, occupation in label_rows:
            ax.text(
                x=pc1 + 0.01,  # small offset so the text does not sit on the marker
                y=pc2 + 0.01,
                s=occupation,
                color='black',
                fontsize=11,
                alpha=0.9,
            )

        # --- Plot Cluster Centroids ---
        centroid_handles, centroid_labels = [], []
        if centroids_available:
            for i in range(NUM_CLUSTERS):
                centroid_label = f'Cluster {i}.0 Center'  # label text matching the figure
                handle = ax.scatter(
                    centroid_pca[i, 0],
                    centroid_pca[i, 1],
                    marker=CENTROID_MARKER,
                    s=CENTROID_SIZE * 2,  # noticeably larger than the data points
                    label=centroid_label,
                    facecolor=CENTROID_COLORS[i % len(CENTROID_COLORS)],
                    edgecolor='black',
                    linewidth=1.5,
                    zorder=5,             # draw on top of the data points
                )
                centroid_handles.append(handle)
                centroid_labels.append(centroid_label)
        else:
            print("Warning: Centroid PCA coordinates not found or have incorrect shape. Skipping centroid plotting.")

        # --- Final Touches ---
        ax.set_title('PCA of Occupation Embeddings with Gender Stereotypes', fontsize=18)
        ax.set_xlabel('First Principal Component', fontsize=14)
        ax.set_ylabel('Second Principal Component', fontsize=14)

        # --- Improve Legend ---
        # Keep only the stereotype entries from the axes' labelled artists (drops any
        # title entry seaborn may add and the centroid entries), then append the
        # centroids so they appear exactly once, at the end.
        current_handles, current_labels = ax.get_legend_handles_labels()
        stereotype_entries = [
            (handle, label)
            for handle, label in zip(current_handles, current_labels)
            if label in label_palette
        ]
        final_handles = [handle for handle, _ in stereotype_entries] + centroid_handles
        final_labels = [label for _, label in stereotype_entries] + centroid_labels

        ax.legend(handles=final_handles, labels=final_labels, title="Gender Stereotype",
                  loc="upper right", fontsize=12, title_fontsize=14)

        # --- Dashed circles around centroids (decorative; a failure here is non-fatal) ---
        try:
            if centroids_available:
                for i in range(NUM_CLUSTERS):
                    circle = plt.Circle(
                        (centroid_pca[i, 0], centroid_pca[i, 1]),
                        radius=CENTROID_CIRCLE_RADIUS,
                        fill=False,
                        linestyle='--',
                        edgecolor='gray',
                        linewidth=1.5,
                        alpha=0.7,
                    )
                    ax.add_patch(circle)
        except Exception as e_circle:
            print(f"Could not draw centroid circles: {e_circle}")

        fig.tight_layout()

        # Save the plot, then release the figure
        fig.savefig(PCA_PLOT_OUTPUT_PNG, dpi=300, bbox_inches='tight')
        print(f"\nPCA scatter plot saved successfully to {PCA_PLOT_OUTPUT_PNG}")
        plt.close(fig)

    except Exception as e:
        print(f"Error generating PCA scatter plot: {e}")
        plt.close()


Generating PCA scatter plot (replicating Figure 7)...

PCA scatter plot saved successfully to /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/semantic_clustering/clustering_pca_plot_figure7.png


## 8. Analyze Cluster Composition (Optional)

In [19]:
# Cross-tabulate cluster membership against stereotype label, provided both columns exist
composition_columns = {'cluster', 'bls_label'}
if composition_columns.issubset(df_meta.columns):
    # Rows: cluster id; columns: stereotype label; cells: number of occupations
    contingency_table = pd.crosstab(index=df_meta['cluster'], columns=df_meta['bls_label'])
    print("\nCluster Composition (Counts):")
    print(contingency_table)

    # Optional (not enabled): row-wise percentages within each cluster
    # contingency_percent = contingency_table.apply(lambda r: r/r.sum() * 100, axis=1)
    # print("\nCluster Composition (% within cluster):")
    # print(contingency_percent.round(1))

else:
    print("Skipping cluster composition analysis: 'cluster' or 'bls_label' column missing.")


Cluster Composition (Counts):
bls_label  female-stereotyped  male-stereotyped  male-stereotyped (proxy)  \
cluster                                                                     
0                           1                 2                         1   
1                           5                 3                         0   

bls_label  neutral  neutral (proxy)  
cluster                              
0                6                1  
1                5                2  
