# Notebook 4: Correlation Analysis (Similarity Bias vs. BLS Demographics)

**Objective:** Investigate the statistical relationship between the calculated similarity-based gender bias (from Notebook 3) and the real-world gender demographics of the workforce (from BLS data prepared in Notebook 1). This involves:
1. Loading the similarity results and the validated occupation dictionary.
2. Merging the datasets.
3. Calculating the demographic gender bias from BLS data (Male % - Female %).
4. Calculating Pearson and Spearman correlation coefficients between the similarity bias and BLS demographic bias.
5. Generating a scatter plot to visualize the relationship (similar to Figure 5).

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats # For correlation calculation
from pathlib import Path
import os

## 2. Configuration

In [2]:
# --- Paths ---
# The kernel's working directory is taken to be the 'notebooks' folder,
# so the project root is one level above it.
current_dir = Path.cwd()
project_root = current_dir.parent

In [3]:
# Input files (both live under <project_root>/results)
# Similarity bias scores written by Notebook 3.
SIMILARITY_RESULTS_CSV = project_root.joinpath('results', 'cosine_similarity_results.csv')
# The *validated* occupation dictionary from Notebook 1; it already carries the BLS ratios.
DICT_INPUT_CSV = project_root.joinpath('results', 'occupation_dictionary_validated.csv')

In [4]:
# Output files (everything is written into the shared results directory)
RESULTS_DIR = project_root.joinpath('results')
# Scatter plot of similarity bias vs. BLS demographic bias.
CORRELATION_PLOT_OUTPUT_PNG = RESULTS_DIR.joinpath('correlation_similarity_vs_bls_figure5.png')
# Optional: the merged table the plot and correlations are computed from.
CORRELATION_DATA_OUTPUT_CSV = RESULTS_DIR.joinpath('correlation_analysis_sim_vs_bls_data.csv')

In [5]:
# --- Plotting Parameters ---
# Fixed colour per BLS label (kept consistent with Notebook 3).
BLS_LABEL_COLORS = dict([
    ('male-stereotyped', '#95B3D7'),    # Blueish
    ('neutral', '#9DCDA9'),             # Greenish
    ('female-stereotyped', '#FFB598'),  # Orangish/Reddish
])

In [6]:
# Make sure the results directory (and any missing parents) exists;
# an already-existing directory is not an error.
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

## 3. Load Data

In [8]:
# Load the similarity results written by Notebook 3.
# Two columns are needed further down: 'occupation' (the merge key) and
# 'similarity_bias' (one side of the correlation). Both are validated here so
# that a malformed file fails in this cell with a clear message instead of
# later inside the merge.
required_similarity_cols = ['occupation', 'similarity_bias']
try:
    df_similarity = pd.read_csv(SIMILARITY_RESULTS_CSV)
    print(f"Loaded similarity results for {len(df_similarity)} occupations.")
    missing_similarity_cols = [
        col for col in required_similarity_cols if col not in df_similarity.columns
    ]
    if missing_similarity_cols:
        raise KeyError(
            f"Similarity results file is missing required column(s): {missing_similarity_cols}"
        )
except FileNotFoundError:
    print(f"Error: Similarity results file not found at {SIMILARITY_RESULTS_CSV}")
    print("Please ensure Notebook 3 ran successfully.")
    raise
except KeyError as e:
    # Raised by the column check above; re-raised so the notebook stops here.
    print(f"Error: {e}")
    raise
except Exception as e:
    # Any other read/parse failure: report it, then propagate.
    print(f"Error loading similarity results CSV: {e}")
    raise

Loaded similarity results for 100 occupations.


In [9]:
# Load the validated occupation dictionary (contains BLS ratios and labels).
# 'occupation' is the merge key; the three 'bls_*' columns feed the demographic
# bias calculation and the plot colouring. All four are validated here so that
# a malformed file fails in this cell rather than later inside the merge.
required_dictionary_cols = ['occupation', 'bls_male_ratio', 'bls_female_ratio', 'bls_label']
try:
    df_dictionary = pd.read_csv(DICT_INPUT_CSV)
    print(f"Loaded dictionary with {len(df_dictionary)} entries.")
    missing_dictionary_cols = [
        col for col in required_dictionary_cols if col not in df_dictionary.columns
    ]
    if missing_dictionary_cols:
        raise KeyError(
            f"Dictionary file is missing required column(s): {missing_dictionary_cols}"
        )
except FileNotFoundError:
    print(f"Error: Dictionary file not found at {DICT_INPUT_CSV}")
    print("Please ensure Notebook 1 ran successfully.")
    raise
except KeyError as e:
    # Raised by the column check above; re-raised so the notebook stops here.
    print(f"Error: {e}")
    raise
except Exception as e:
    # Any other read/parse failure: report it, then propagate.
    print(f"Error loading dictionary CSV: {e}")
    raise

Loaded dictionary with 100 entries.


## 4. Merge and Prepare Data

In [11]:
# Restrict each frame to the columns the analysis needs before merging, so the
# only column the two sides share is the 'occupation' key.
cols_from_sim = ['occupation', 'similarity_bias']
cols_from_dict = [
    'occupation',
    'bls_male_ratio',
    'bls_female_ratio',
    'bls_label',
]

In [12]:
# Inner join on 'occupation': only occupations present in both the dictionary
# and the similarity results are kept.
df_merged = df_dictionary[cols_from_dict].merge(
    df_similarity[cols_from_sim],
    how='inner',
    on='occupation',
)

In [13]:
# BLS demographic gender bias = male share minus female share
# (i.e. Male % - Female %, expressed with the ratio columns).
male_ratio = df_merged['bls_male_ratio']
female_ratio = df_merged['bls_female_ratio']
df_merged['bls_demographic_bias'] = male_ratio - female_ratio

In [14]:
# Remove rows where either correlation input is missing, recording the row
# count on both sides of the drop so the loss can be reported.
cols_to_check_na = ['bls_demographic_bias', 'similarity_bias']
rows_before_na_drop = len(df_merged)
df_merged = df_merged.dropna(subset=cols_to_check_na)
rows_after_na_drop = len(df_merged)

In [15]:
# Only mention the NaN drop when it actually removed something.
if rows_before_na_drop > rows_after_na_drop:
    print(f"Dropped {rows_before_na_drop - rows_after_na_drop} rows with missing values in correlation columns.")

In [16]:
# Summarise what is left for the analysis, or flag that nothing is.
if not df_merged.empty:
    print(f"Prepared data for {len(df_merged)} occupations for correlation analysis.")
    print("\nSample of prepared data:")
    preview_columns = ['occupation', 'bls_label', 'bls_demographic_bias', 'similarity_bias']
    print(df_merged[preview_columns].head())
else:
    print("Error: No valid data remaining after merging and cleaning. Cannot perform correlation analysis.")

Prepared data for 100 occupations for correlation analysis.

Sample of prepared data:
            occupation           bls_label  bls_demographic_bias  \
0      chief executive             neutral                 0.340   
1              manager             neutral                 0.324   
2    marketing manager             neutral                -0.284   
3        sales manager             neutral                 0.342   
4  fundraising manager  female-stereotyped                -0.402   

   similarity_bias  
0        -0.000055  
1         0.000889  
2        -0.004225  
3        -0.000962  
4        -0.005224  


## 5. Calculate Correlation Coefficients

In [18]:
# Placeholders used when the correlations cannot be computed: NaN statistics
# and a fallback annotation for the plot.
pearson_r = pearson_p = np.nan
spearman_rho = spearman_p = np.nan
correlation_text = "Correlation could not be calculated."

In [19]:
def _format_p(p_value):
    """Format a p-value for the plot annotation.

    Values below 0.001 are shown as 'p<0.001' rather than the misleading
    'p=0.000' that three-decimal rounding would produce; everything else
    (including NaN) keeps the 'p=<value to 3 decimals>' form.
    """
    return 'p<0.001' if p_value < 0.001 else f'p={p_value:.3f}'


# Both tests need at least two observations; len() > 1 also excludes an empty frame.
if len(df_merged) > 1:
    try:
        # Pearson correlation (linear relationship).
        # The result is unpacked as a (statistic, p-value) pair, which works the
        # same way for both SciPy result objects used in this cell.
        pearson_result = stats.pearsonr(df_merged['bls_demographic_bias'], df_merged['similarity_bias'])
        pearson_r, pearson_p = pearson_result
        print(f"Pearson Correlation: r = {pearson_r:.4f}, p = {pearson_p:.4f}")

        # Spearman correlation (monotonic relationship, less sensitive to outliers)
        spearman_result = stats.spearmanr(df_merged['bls_demographic_bias'], df_merged['similarity_bias'])
        spearman_rho, spearman_p = spearman_result
        print(f"Spearman Correlation: rho = {spearman_rho:.4f}, p = {spearman_p:.4f}")

        # Text for the plot annotation (similar to Figure 5); ρ denotes Spearman's rho.
        correlation_text = (
            f'Pearson: r={pearson_r:.3f} ({_format_p(pearson_p)})\n'
            f'Spearman: ρ={spearman_rho:.3f} ({_format_p(spearman_p)})'
        )

    except Exception as e:
        # Best effort: report the failure and keep the placeholder values/text
        # so the remaining cells can still run.
        print(f"Error calculating correlations: {e}")
else:
    print("Skipping correlation calculation: Not enough valid data.")

Pearson Correlation: r = 0.0436, p = 0.6664
Spearman Correlation: rho = 0.0264, p = 0.7942


## 6. Generate Scatter Plot (Figure 5)

In [20]:
# Progress message ahead of the plotting cell.
print("\nGenerating correlation scatter plot...")


Generating correlation scatter plot...


In [21]:
if not df_merged.empty:
    try:
        # Work with explicit figure/axes handles rather than pyplot's implicit
        # "current figure" state. Size chosen for readability.
        fig, ax_scatter = plt.subplots(figsize=(11, 9))

        # One point per occupation, coloured by its BLS label.
        sns.scatterplot(
            data=df_merged,
            x='bls_demographic_bias',
            y='similarity_bias',
            hue='bls_label',
            palette=BLS_LABEL_COLORS,
            s=70,             # point size
            alpha=0.8,        # point transparency
            edgecolor="w",    # white outline around each point
            linewidth=0.5,
            ax=ax_scatter,
        )

        # Linear fit with a 95% confidence band, drawn on the same axes
        # without re-plotting the points.
        sns.regplot(
            data=df_merged,
            x='bls_demographic_bias',
            y='similarity_bias',
            scatter=False,
            ci=95,
            line_kws={'color': 'black', 'linestyle': '--', 'linewidth': 1.5},
            ax=ax_scatter,
        )

        # Correlation summary in the top-left corner; the position is given in
        # axes-relative coordinates (3% from the left, 97% from the bottom).
        ax_scatter.text(
            0.03, 0.97,
            correlation_text,
            transform=ax_scatter.transAxes,
            fontsize=10,
            verticalalignment='top',
            bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8),
        )

        # Title and axis labels
        ax_scatter.set_title('Similarity-Based Gender Bias vs BLS Gender Bias', fontsize=16)
        ax_scatter.set_xlabel('BLS Gender Bias (Male % - Female %)', fontsize=12)
        ax_scatter.set_ylabel('Similarity-Based Gender Bias Score', fontsize=12)

        # BLS bias is naturally bounded to [-1, 1]; pad slightly on both sides.
        # The y-axis limits are left to matplotlib's autoscaling.
        ax_scatter.set_xlim(-1.05, 1.05)

        # Dotted zero reference lines for both bias measures.
        ax_scatter.axhline(y=0, color='grey', linestyle=':', linewidth=1, alpha=0.7)
        ax_scatter.axvline(x=0, color='grey', linestyle=':', linewidth=1, alpha=0.7)

        # Legend sits outside the axes on the right; the layout rect reserves room for it.
        ax_scatter.legend(
            title='BLS Label',
            fontsize=10,
            title_fontsize=11,
            loc='upper left',
            bbox_to_anchor=(1.02, 1),
        )
        fig.tight_layout(rect=[0, 0, 0.88, 1])

        # Write the figure to disk, then release it.
        fig.savefig(CORRELATION_PLOT_OUTPUT_PNG, dpi=300, bbox_inches='tight')
        print(f"Correlation scatter plot saved successfully to {CORRELATION_PLOT_OUTPUT_PNG}")
        plt.close(fig)

    except Exception as plot_error:
        # Best effort: report the problem and close whatever figure is current.
        print(f"Error generating scatter plot: {plot_error}")
        plt.close()
else:
    print("Skipping scatter plot generation: No data available.")

Correlation scatter plot saved successfully to /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/correlation_similarity_vs_bls_figure5.png


## 7. Save Correlation Analysis Data (Optional)

In [23]:
# Persist the merged table behind the correlations and the plot.
# A write failure is reported but does not stop the notebook.
print(f"\nSaving data used for correlation analysis to {CORRELATION_DATA_OUTPUT_CSV}...")
try:
    df_merged.to_csv(CORRELATION_DATA_OUTPUT_CSV, encoding='utf-8', index=False)
    print("Correlation data saved successfully.")
except Exception as save_error:
    print(f"Error saving correlation data: {save_error}")


Saving data used for correlation analysis to /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/correlation_analysis_sim_vs_bls_data.csv...
Correlation data saved successfully.
