# Notebook 1: Data Preparation - Loading the Occupation-Gender Dictionary

**Objective:** Load the pre-compiled occupation-gender stereotype dictionary, validate its contents, and prepare it for use in subsequent analysis notebooks. This dictionary is expected to contain the 100 curated occupations along with their BLS gender statistics and stereotype labels.

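**Based on Project Structure:** This notebook reads its primary input from `data/occupation_gender_data.csv`. The path is resolved relative to the project root, which is taken to be the parent of the directory the notebook is run from (expected to be `notebooks/`).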

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path


## 2. Configuration

In [16]:
# Resolve every location relative to the project root instead of hard-coding it.
# The project root is taken to be the parent of the current working directory,
# which assumes the notebook is run from the 'notebooks' directory.
current_dir = Path.cwd()
project_root = current_dir.parent

data_dir = project_root / 'data'
raw_data_dir = data_dir / 'raw'
processed_data_dir = data_dir / 'processed'

# Create the raw/processed sub-directories if they don't exist
# (parents=True also creates data_dir itself when it is missing).
raw_data_dir.mkdir(parents=True, exist_ok=True)
processed_data_dir.mkdir(parents=True, exist_ok=True)

# The input dictionary is read directly from data_dir (not from raw_data_dir).
INPUT_CSV_FILE = data_dir / 'occupation_gender_data.csv'

# Warn early if the file is missing. The hint prints the directory the file is
# actually read from, so it always agrees with INPUT_CSV_FILE.
if not INPUT_CSV_FILE.exists():
    print(f"Warning: The file {INPUT_CSV_FILE} does not exist.")
    print(f"Please ensure the file is placed in: {INPUT_CSV_FILE.parent}")


In [10]:
# Outputs of this notebook go into a 'results' folder directly under the project root.
RESULTS_DIR = project_root.joinpath('results')
# Full path of the validated dictionary that is written at the end of the notebook.
OUTPUT_CSV_FILE = RESULTS_DIR.joinpath('occupation_dictionary_validated.csv')
# Make sure the folder exists before anything is saved into it.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

In [11]:
# --- Expected input schema ---
# Column names the input CSV must provide (taken from exp1.py and the paper).
# Edit this list if your CSV uses different names:
#   occupation  - curated occupation name
#   bls_female  - female ratio (e.g., 0.868) or percentage
#   bls_male    - male ratio (e.g., 0.132) or percentage
#   bls_label   - stereotype label ('female-stereotyped', 'male-stereotyped', 'neutral')
EXPECTED_COLUMNS = ['occupation', 'bls_female', 'bls_male', 'bls_label']

# Name of the column holding the female proportion (ratio or percentage).
FEMALE_PROP_COL = 'bls_female'

# Scale of FEMALE_PROP_COL: True means 0-100 (percentage), False means 0-1 (ratio).
IS_PERCENTAGE = False

In [12]:
# Ensure the results folder exists (no error if it is already there).
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

## 3. Load Data

In [17]:
# Echo the resolved input path so the file about to be read is visible in the cell output.
print(f"Loading occupation dictionary from: {INPUT_CSV_FILE}")

Loading occupation dictionary from: /Users/jessie/Documents/Projects/master_thesis_llms_bias/data/occupation_gender_data.csv


In [14]:
# Fail fast with an explanatory message when the input CSV is missing,
# rather than letting the load step fail later.
if not os.path.exists(INPUT_CSV_FILE):
    for message in (
        f"Error: The input file '{INPUT_CSV_FILE}' does not exist.",
        "Please ensure the file is located in the correct directory relative to this notebook.",
        "If the file is missing, you may need to download or generate it.",
    ):
        print(message)
    raise FileNotFoundError(f"Input file not found: {INPUT_CSV_FILE}")

In [15]:
# Read the occupation dictionary. Any failure is reported and then re-raised,
# so the notebook stops here instead of continuing without data.
try:
    df_dictionary = pd.read_csv(INPUT_CSV_FILE)
except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_CSV_FILE}")
    print("Please ensure the path is correct relative to the notebook's location.")
    raise
except Exception as e:
    print(f"Error loading CSV: {e}")
    raise
else:
    # Only reached when the read succeeded.
    print(f"Successfully loaded {len(df_dictionary)} rows.")

Successfully loaded 100 rows.


## 4. Validate Data

In [19]:
# Verify that every column listed in EXPECTED_COLUMNS is present in the loaded frame.
missing_cols = [name for name in EXPECTED_COLUMNS if name not in df_dictionary.columns]
if missing_cols:
    # Report what is missing alongside what is available, then abort.
    print(f"Error: Missing expected columns: {missing_cols}")
    print(f"Available columns are: {df_dictionary.columns.tolist()}")
    raise ValueError("Input CSV does not have the expected columns.")

print("All expected columns are present.")

All expected columns are present.


In [20]:
# Clean occupation names (remove leading/trailing whitespace).
# This overwrites the 'occupation' column of df_dictionary, so the duplicate
# check further down compares the stripped names.
df_dictionary['occupation'] = df_dictionary['occupation'].str.strip()

In [None]:
# Coerce the female/male proportion columns to numeric and report any problems.
numeric_cols_to_check = ['bls_female', 'bls_male']
for col in numeric_cols_to_check:
    if col not in df_dictionary.columns:
        continue

    # Count missing values before and after coercion: an increase means some
    # entries could not be parsed as numbers and were turned into NaN.
    original_nan_count = df_dictionary[col].isna().sum()
    df_dictionary[col] = pd.to_numeric(df_dictionary[col], errors='coerce')
    new_nan_count = df_dictionary[col].isna().sum()

    if new_nan_count > original_nan_count:
        print(f"Warning: Column '{col}' contained non-numeric values that were converted to NaN.")
    # Flag any NaN at all, whether pre-existing or introduced by the coercion.
    if new_nan_count > 0:
        print(f"Warning: Column '{col}' contains NaN values.")

In [22]:
# Standardize Female/Male Ratios (ensure they are 0-1).
# Adds two columns to df_dictionary:
#   bls_female_ratio - FEMALE_PROP_COL on a 0-1 scale
#   bls_male_ratio   - 1 - bls_female_ratio (derived, not read from 'bls_male')
if FEMALE_PROP_COL in df_dictionary.columns:
    if IS_PERCENTAGE:
        print(f"Converting '{FEMALE_PROP_COL}' from percentage to ratio (dividing by 100).")
        df_dictionary['bls_female_ratio'] = df_dictionary[FEMALE_PROP_COL] / 100.0
    else:
        # Assume it's already a ratio, just copy/rename for consistency
        df_dictionary['bls_female_ratio'] = df_dictionary[FEMALE_PROP_COL]
        # Check if values are potentially percentages by mistake
        if (df_dictionary['bls_female_ratio'] > 1.0).any():
             print(f"Warning: Column '{FEMALE_PROP_COL}' was expected to be a ratio (0-1), but contains values > 1. Check IS_PERCENTAGE setting.")

    # Range check for the cases the warning above does not cover: negative
    # values (either branch) and values still above 1 after the percentage
    # conversion. NaN compares False on both sides and is not flagged here.
    has_negative = (df_dictionary['bls_female_ratio'] < 0.0).any()
    has_excess_after_conversion = IS_PERCENTAGE and (df_dictionary['bls_female_ratio'] > 1.0).any()
    if has_negative or has_excess_after_conversion:
        print(f"Warning: 'bls_female_ratio' contains values outside the 0-1 range, so 'bls_male_ratio' will be out of range too. Please verify column '{FEMALE_PROP_COL}'.")

    # Calculate male ratio if not present or to ensure consistency
    df_dictionary['bls_male_ratio'] = 1.0 - df_dictionary['bls_female_ratio']
    print("Ensured 'bls_female_ratio' and 'bls_male_ratio' (0-1 scale) columns exist.")

Ensured 'bls_female_ratio' and 'bls_male_ratio' (0-1 scale) columns exist.


In [23]:
# Compare the labels found in 'bls_label' against the accepted set.
# 'unknown' is accepted as well, as per the exp1.py filter.
expected_labels = {'female-stereotyped', 'male-stereotyped', 'neutral', 'unknown'}
actual_labels = set(df_dictionary['bls_label'].unique())
unexpected_labels = actual_labels.difference(expected_labels)
if not unexpected_labels:
    print("Values in 'bls_label' column are within expected categories.")
else:
    print(f"Warning: Found unexpected values in 'bls_label' column: {unexpected_labels}")
    print(f"Expected values are typically within: {expected_labels}")

Values in 'bls_label' column are within expected categories.


In [None]:
# Look for occupations that appear more than once; keep=False marks every
# occurrence so that all conflicting rows are shown together.
duplicates = df_dictionary.loc[df_dictionary.duplicated(subset=['occupation'], keep=False)]
if duplicates.empty:
    print("\nNo duplicate occupation names found.")
else:
    print("\nWarning: Duplicate occupation names found after stripping whitespace:")
    print(duplicates.sort_values('occupation'))


No duplicate occupation names found.


## 5. Final Selection and Display

In [26]:
# Columns of the output dictionary, in output order: the standardized columns first.
final_columns = ['occupation', 'bls_female_ratio', 'bls_male_ratio', 'bls_label']

# Append each original proportion column after them, but only when both it and
# its standardized counterpart exist and the two are different columns.
for source_col, ratio_col in ((FEMALE_PROP_COL, 'bls_female_ratio'), ('bls_male', 'bls_male_ratio')):
    both_present = source_col in df_dictionary.columns and ratio_col in df_dictionary.columns
    if both_present and source_col != ratio_col:
        final_columns.append(source_col)

In [27]:
# Drop any requested column that the frame does not actually contain,
# keeping the requested order.
existing_columns = set(df_dictionary.columns)
final_columns = [name for name in final_columns if name in existing_columns]

In [28]:
# Take an independent copy of the selected columns so that later changes to
# the output frame do not touch df_dictionary.
df_final_dictionary = df_dictionary.loc[:, final_columns].copy()

In [29]:
# --- Final Verification ---
# Report the shape, then warn when the row count differs from the expected
# number of curated occupations.
print(f"\nFinal dictionary shape: {df_final_dictionary.shape}")
expected_rows = 100 # Based on paper description
actual_rows = df_final_dictionary.shape[0]
if actual_rows != expected_rows:
    print(f"Warning: Final dictionary has {actual_rows} rows, but expected {expected_rows}. Please verify input data.")


Final dictionary shape: (100, 6)


In [30]:
# Show the first five rows as a quick visual check of the output frame.
print("\nFinal Dictionary Sample (first 5 rows):")
sample_rows = df_final_dictionary.head(5)
print(sample_rows)


Final Dictionary Sample (first 5 rows):
            occupation  bls_female_ratio  bls_male_ratio           bls_label  \
0      chief executive             0.330           0.670             neutral   
1              manager             0.338           0.662             neutral   
2    marketing manager             0.642           0.358             neutral   
3        sales manager             0.329           0.671             neutral   
4  fundraising manager             0.701           0.299  female-stereotyped   

   bls_female  bls_male  
0       0.330     0.670  
1       0.338     0.662  
2       0.642     0.358  
3       0.329     0.671  
4       0.701     0.299  


In [31]:
# Count how many occupations carry each stereotype label.
print("\nStereotype Label Distribution:")
label_counts = df_final_dictionary['bls_label'].value_counts()
print(label_counts)


Stereotype Label Distribution:
bls_label
neutral               47
female-stereotyped    27
male-stereotyped      26
Name: count, dtype: int64


In [32]:
# Print a header, then the frame summary. DataFrame.info() writes its report
# to standard output itself, so it is not wrapped in print().
print("\nDataframe Info:")
df_final_dictionary.info()


Dataframe Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   occupation        100 non-null    object 
 1   bls_female_ratio  100 non-null    float64
 2   bls_male_ratio    100 non-null    float64
 3   bls_label         100 non-null    object 
 4   bls_female        100 non-null    float64
 5   bls_male          100 non-null    float64
dtypes: float64(4), object(2)
memory usage: 4.8+ KB


## 6. Save Output

In [33]:
# Write the validated dictionary to OUTPUT_CSV_FILE as UTF-8 CSV without the index.
# A failed write is reported and then re-raised, matching the load step: the
# saved file is the purpose of this notebook, so a run that could not produce
# it must not finish as if it had succeeded.
print(f"\nSaving validated occupation dictionary to {OUTPUT_CSV_FILE}...")
try:
    df_final_dictionary.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8')
except Exception as e:
    print(f"Error saving dictionary: {e}")
    raise
else:
    # Only reached when the write completed without an exception.
    print("Dictionary saved successfully.")


Saving validated occupation dictionary to /Users/jessie/Documents/Projects/master_thesis_llms_bias/results/occupation_dictionary_validated.csv...
Dictionary saved successfully.
