# Occupational Gender Bias

In [2252]:
# Import libraries
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

## Occupations data

In [2253]:
# Read files
occupations = pd.read_excel('occupations.xlsx')
occupations.head()

Unnamed: 0,#,zh,en,hu,chatgpt,copilot,gemini,deepseek
0,1.0,模特,model,modell,2.5,1.8,1.0,-0.3
1,2.0,军人,soldier,katona,-2.8,-2.5,-2.5,-2.95
2,3.0,法医,pathologist,kórboncnok,-1.2,-1.5,-0.5,-1.8
3,4.0,董事长,CEO,vezérigazgató,-2.5,-2.0,-2.0,-2.1
4,5.0,高管,manager,menedzser,-1.0,-1.5,-1.5,-1.2


## Hungarian data

In [2254]:
# Read Hungarian participant data from prolific_hu.csv
df_prolific_hu = pd.read_csv('prolific_hu.csv')

# Return Participant id of rows where Status is RETURNED or REJECTED
rejects = df_prolific_hu[df_prolific_hu['Status'].isin(['RETURNED', 'REJECTED'])]['Participant id'].unique()

# create reject list
rejects_list = rejects.tolist()
print(f"Rejected participants: {rejects_list}")

df_prolific_hu.head()

Rejected participants: ['5e57a0020c3c6a14a1624031', '599494e7bf8bcf0001ab6973', '5f5a27482be30c0718bbf1e0']


Unnamed: 0,Submission id,Participant id,Status,Custom study tncs accepted at,Started at,Completed at,Reviewed at,Archived at,Time taken,Completion code,Total approvals,Age,Sex,Ethnicity simplified,Country of birth,Country of residence,Nationality,Language,Student status,Employment status
0,68708cf6b2a5c6baa48d6343,5ef60257cd680928de23ccae,APPROVED,Not Applicable,2025-07-11T04:03:02.613000Z,2025-07-11T04:07:42.625000Z,2025-07-15T04:08:31.855000Z,2025-07-11T04:07:43.031319Z,281.0,C1MO6037,377,29,Male,White,Hungary,Hungary,Hungary,Hungarian,Yes,Full-Time
1,68708d873d4d0b6d92e59937,5c48be0496d59b000183e68d,APPROVED,Not Applicable,2025-07-11T04:05:42.316000Z,2025-07-11T04:13:25.502000Z,2025-07-15T04:08:32.344000Z,2025-07-11T04:13:25.947923Z,464.0,C1MO6037,1457,48,Male,White,Hungary,Hungary,Hungary,Hungarian,No,Full-Time
2,68708f072c19330dab93cd7b,5d3449524e8363001735fc41,APPROVED,Not Applicable,2025-07-11T04:12:14.614000Z,2025-07-11T04:20:51.642000Z,2025-07-15T04:08:32.762000Z,2025-07-11T04:20:52.060603Z,518.0,C1MO6037,657,42,Male,White,Hungary,Hungary,Hungary,Hungarian,DATA_EXPIRED,Full-Time
3,68708f80de42cf279f3089d8,5a913d2cf0536100017196d8,APPROVED,Not Applicable,2025-07-11T04:13:57.716000Z,2025-07-11T04:16:47.135000Z,2025-07-15T04:08:33.160000Z,2025-07-11T04:16:47.592395Z,170.0,C1MO6037,29,27,Male,White,Hungary,Hungary,Hungary,Hungarian,No,Full-Time
4,68708fb5ab7d87dd294e6c74,5d3873197860c8001a106e02,APPROVED,Not Applicable,2025-07-11T04:16:49.056000Z,2025-07-11T04:21:24.387000Z,2025-07-15T04:08:33.533000Z,2025-07-11T04:21:24.787505Z,276.0,C1MO6037,2564,34,Male,White,Hungary,Hungary,Hungary,Hungarian,No,Full-Time


In [2255]:
# Read Hungarian data from hu.xlsx
df_hu = pd.read_excel('ratings_hu.xlsx')

# Rename Prolific ID to Participant id
df_hu.rename(columns={'Prolific ID': 'Participant id'}, inplace=True)

# Print number of participants
print(f"Number of participants: {df_hu.shape[0]}")

# Define Hungarian attention checks
hu_attention_checks = ['pincérnő', 'titkárnő', 'tanárnő', 'takarítónő', 'ápolónő', 'házvezetőnő']

# Show wo failed attention any checks, find participants who did not mark all hu_attention_checks columns as 3
failed_attention = df_hu[df_hu[hu_attention_checks].apply(lambda row: any(row != 'Teljesen női'), axis=1)]

# Show the failed participants (if any)
failed = failed_attention[['Participant id'] + hu_attention_checks] if not failed_attention.empty else print("All participants passed the attention checks.")
failed

Number of participants: 22


Unnamed: 0,Participant id,pincérnő,titkárnő,tanárnő,takarítónő,ápolónő,házvezetőnő
2,5a913d2cf0536100017196d8,Teljesen női,Nagyrészt női,Teljesen női,Teljesen női,Teljesen női,Teljesen női
9,599494e7bf8bcf0001ab6973,Teljesen női,Teljesen férfi,Teljesen női,Teljesen női,Teljesen női,Teljesen női
12,5e9ab5df2893af141343bce7,Inkább női,Nagyrészt női,Inkább női,Inkább női,Nagyrészt női,Inkább női
14,S,Teljesen női,Teljesen női,Teljesen női,Nagyrészt női,Teljesen női,Teljesen női


In [2256]:
# Remove those who failed miserably
df_hu = df_hu[df_hu['Participant id'] != '5e9ab5df2893af141343bce7']
df_hu = df_hu[df_hu['Participant id'] != '599494e7bf8bcf0001ab6973']

# Remove participants who were rejected
df_hu = df_hu[~df_hu['Participant id'].isin(rejects_list)]

# Print number of participants
print(f"Number of participants: {df_hu.shape[0]}")

# Drop attention checks
df_hu.drop(columns=hu_attention_checks, inplace=True)

Number of participants: 20


In [2257]:
# Define the mapping for ratings
rating_map = {
    'Teljesen férfi': -3,
    'Nagyrészt férfi': -2,
    'Inkább férfi': -1,
    'Semleges/egyenlő': 0,
    'Inkább női': 1,
    'Nagyrészt női': 2,
    'Teljesen női': 3
}

# Get columns to convert (skip non-rating columns)
rating_columns = df_hu.columns[8:]  # assuming first 8 columns are not ratings

# Replace and explicitly infer objects to avoid warning
for col in rating_columns:
    df_hu[col] = df_hu[col].map(rating_map)
    
# Drop rows where Prolific ID is in rejects_list
df_hu = df_hu[~df_hu['Participant id'].isin(rejects_list)]

# Rename Életkor to Age and Nem to Sex
df_hu.rename(columns={'Életkor': 'Age', 'Nem':'Gender'}, inplace=True)

# Count and print participants based on unique Participant id
num_participants = df_hu['Participant id'].nunique()
print(f'Number of participants: {num_participants}')

# Count the number of columns starting with the 8th.
num_columns = len(df_hu.columns) - 8
print(f'Number of words: {num_columns}')

# Show
df_hu.head()
    

Number of participants: 20
Number of words: 44


Unnamed: 0,ID,Start time,Completion time,Email,Name,Participant id,Age,Gender,modell,katona,...,dietetikus,tanár,rendőr,pilóta,recepciós,biztonsági őr,ügyész,kozmetikus,programozó,diák
0,1,2025-07-11 12:06:52,2025-07-11 12:07:43,anonymous,,5ef60257cd680928de23ccae,25-35,férfi,2,-3,...,0,0,-2,-1,0,-2,0,2,-1,0
1,2,2025-07-11 12:06:58,2025-07-11 12:13:33,anonymous,,5c48be0496d59b000183e68d,45-55,férfi,2,-2,...,0,0,-2,-2,1,-2,-1,3,-2,0
2,3,2025-07-11 12:14:03,2025-07-11 12:16:52,anonymous,,5a913d2cf0536100017196d8,25-35,férfi,0,-2,...,0,0,-1,0,0,-2,0,2,0,0
3,4,2025-07-11 12:12:24,2025-07-11 12:21:19,anonymous,,5d3449524e8363001735fc41,35-45,férfi,2,-2,...,1,2,-2,-2,2,-2,-1,3,-2,0
4,5,2025-07-11 12:16:52,2025-07-11 12:21:35,anonymous,,5d3873197860c8001a106e02,25-35,férfi,0,-2,...,0,0,-1,-1,0,-1,0,2,-2,0


### Demographics

In [2258]:
# Prepare gender and age counts and percentages
gender_counts = df_hu['Gender'].value_counts()
gender_labels = gender_counts.index
gender_labels_en = ['Male' if g == 'férfi' else 'Female' for g in gender_labels]
gender = gender_counts / gender_counts.sum() * 100

age_counts = df_hu['Age'].value_counts().sort_index()
age = age_counts / age_counts.sum() * 100

# Gender pie with clean hover labels
gender_pie = go.Pie(
    labels=gender_labels_en,
    values=gender.values,
    name='Gender',
    hole=0.33,
    title='Gender',
    customdata=np.stack([gender_labels_en, gender_counts.values, gender.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(gender_labels_en, gender_counts.values, gender.values)],
)

# Age pie with clean hover labels
age_pie = go.Pie(
    labels=age.index,
    values=age.values,
    name='Age',
    hole=0.33,
    title='Age',
    customdata=np.stack([age.index, age_counts.values, age.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    # hovertemplate='<b>%{customdata[0]}</b><br>n=%{customdata[1]}<br>%{customdata[2]:.1f}%<extra></extra>',
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(age.index, age_counts.values, age.values)],
)

# Create subplot with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=['Gender Distribution', 'Age Distribution']
)

fig.add_trace(gender_pie, 1, 1)
fig.add_trace(age_pie, 1, 2)

fig.update_layout(
    # title_text='Gender and Age Distribution',
    font = dict(family="Times New Roman, serif", size=32, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    showlegend=False,
    legend=dict(
        font=dict(size=28),
        orientation='v',
        x=0.5,
        xanchor='center',
        y=1,
        yanchor='top'
    ),
    annotations=[
        dict(
            text=f'No. of participants: {df_hu.shape[0]}',
            x=0.5, y=0, xref='paper', yref='paper',
            showarrow=False, font=dict(size=28), align='center'
        )
    ]
)

fig.show()

# Save it as html
fig.write_html('demographics_hu.html')

# Save it as image
fig.write_image('demographics_hu.png', scale=3, width=1000, height=500)


In [2259]:
# Custom colors
female_color = "#C46BE1"
male_color = "#4090C2"
neutral_color = "#949494"

color_map = {
    'Men': female_color,
    'Women': male_color,
    'Equal': neutral_color
}

# Three shades of teal for age groups
gender_colors = [female_color, male_color]
age_shades = ['#77D6C6', '#4DAF9E', '#006B5F']

# Prepare gender and age counts and percentages
gender_counts = df_hu['Gender'].value_counts()
gender_labels = gender_counts.index
gender_labels_en = ['Male' if g == 'férfi' else 'Female' for g in gender_labels]
gender = gender_counts / gender_counts.sum() * 100

age_counts = df_hu['Age'].value_counts().sort_index()
age = age_counts / age_counts.sum() * 100

# Gender pie with custom colors
gender_pie = go.Pie(
    labels=gender_labels_en,
    values=gender.values,
    name='Gender',
    hole=0.33,
    title='Gender',
    marker=dict(colors=gender_colors),
    customdata=np.stack([gender_labels_en, gender_counts.values, gender.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(gender_labels_en, gender_counts.values, gender.values)],
)

# Age pie with custom teal shades (repeat if more age groups)
age_colors = (age_shades * ((len(age) // len(age_shades)) + 1))[:len(age)]
age_pie = go.Pie(
    labels=age.index,
    values=age.values,
    name='Age',
    hole=0.33,
    title='Age',
    marker=dict(colors=age_colors),
    customdata=np.stack([age.index, age_counts.values, age.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(age.index, age_counts.values, age.values)],
)

# Create subplot with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=['Gender Distribution', 'Age Distribution']
)

fig.add_trace(gender_pie, 1, 1)
fig.add_trace(age_pie, 1, 2)

fig.update_layout(
    font = dict(family="Times New Roman, serif", size=32, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    showlegend=False,
    legend=dict(
        font=dict(size=28),
        orientation='v',
        x=0.5,
        xanchor='center',
        y=1,
        yanchor='top'
    ),
    annotations=[
        dict(
            text=f'No. of participants: {df_hu.shape[0]}',
            x=0.5, y=0, xref='paper', yref='paper',
            showarrow=False, font=dict(size=28), align='center'
        )
    ]
)

fig.show()

# Save it as html
fig.write_html('demographics_hu.html')

# Save it as image
fig.write_image('demographics_hu.png', scale=3, width=1000, height=500)

### Analysis

#### Two Sample T-Test by Gender

In [2260]:
# Add a new column with the mean of the ratings for each participant
df_hu['Mean Rating'] = df_hu.iloc[:, 8:].mean(axis=1)

# Separate df_hu into male and female datasets based on the Gender column
df_hu_male = df_hu[df_hu['Gender'] == 'férfi']
df_hu_female = df_hu[df_hu['Gender'] == 'nő']

# Show the number of male and female participants
print(f"Number of male participants: {df_hu_male['Participant id'].nunique()}")
print(f"Number of female participants: {df_hu_female['Participant id'].nunique()}")

# Two Sample T-Test between male and female participants for each occupation word
results = []
for col in rating_columns:
    male_ratings = df_hu_male[col].dropna().astype(float)
    female_ratings = df_hu_female[col].dropna().astype(float)
    # Only test if both groups have at least 2 ratings
    if len(male_ratings) > 1 and len(female_ratings) > 1:
        t_stat, p_value = stats.ttest_ind(male_ratings, female_ratings, equal_var=False)
        mean_male = male_ratings.mean()
        mean_female = female_ratings.mean()
        results.append({
            'occupation': col,
            'mean_male': mean_male,
            'mean_female': mean_female,
            't_stat': t_stat,
            'p_value': p_value,
            'significant': p_value < 0.05,
            'marg_significant': 0.05 < p_value < 0.1
        })

df_ttest = pd.DataFrame(results)

# Add overall mean rating for sorting
df_ttest['overall_mean'] = (df_ttest['mean_male'] + df_ttest['mean_female']) / 2

# Sort by overall mean rating
df_ttest_sorted = df_ttest.sort_values('overall_mean', ascending=False)

# Prepare axis labels: bold for significant
def bold_label(row):
    occupation = row['occupation']
    
    if row.get('significant'):
        occupation = f"<b>{occupation}*</b>"
    
    if row.get('marg_significant'):
        occupation = f"<b>{occupation}+</b>"
    
    return occupation

df_ttest_sorted['occupation_label'] = df_ttest_sorted.apply(bold_label, axis=1)

# Plot
fig = go.Figure()
fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation_label'],
    y=df_ttest_sorted['mean_male'],
    name='Male',
    marker_color='#1f77b4'
))
fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation_label'],
    y=df_ttest_sorted['mean_female'],
    name='Female',
    marker_color='#e377c2'
))
fig.update_layout(
    barmode='group',
    title='Mean Ratings of Occupational Titles by Gender',
    xaxis_title='Hungarian data (significant*, and marginally significant+ in <b>bold</b>)',
    yaxis_title='Bias (Male - Female)',
    xaxis_tickangle=-45,
    template='plotly_white'
)
fig.show()

# Save results and plot
df_ttest_sorted.to_excel('occupations_gender_ttest_hu.xlsx', index=False)
fig.write_html('occupations_gender_ttest_hu.html')
fig.write_image('occupations_gender_ttest_hu.png', scale=3, width=1000, height=500)

Number of male participants: 9
Number of female participants: 11


#### Biases and biases by

In [2261]:
# Add a column 'bias_by' to df_ttest_sorted: 'Male' if abs(mean_male) > abs(mean_female), 'Female' if abs(mean_female) > abs(mean_male), 'Equal' if they are the same
df_ttest_sorted['bias_by'] = np.where(
    np.abs(df_ttest_sorted['mean_male']) > np.abs(df_ttest_sorted['mean_female']), 'Men',
    np.where(np.abs(df_ttest_sorted['mean_male']) < np.abs(df_ttest_sorted['mean_female']), 'Women', 'Equal')
)

# Add columns that show if the bias is male or female. Add male, if the overall mean is lower than 0, othervise add Female
df_ttest_sorted['bias'] = np.where(
    df_ttest_sorted['overall_mean'] > 0, 'Female',
    np.where(df_ttest_sorted['overall_mean'] < 0, 'Male', 'Equal')
)

# Calculate mean_difference (mean_male - mean_female), and add its absolute value
df_ttest_sorted['mean_difference'] = (df_ttest_sorted['mean_male'] - df_ttest_sorted['mean_female']).abs()


In [2262]:
# Plot
fig = go.Figure()
fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation'],
    y=df_ttest_sorted['overall_mean'],
    marker_color=df_ttest_sorted['bias_by'].map(color_map),
    customdata=df_ttest_sorted[['mean_difference', 'bias_by']],
    hovertemplate='Occupation: %{x}<br>Overall Mean: %{y:.2f}<br>Mean Diff: %{customdata[0]:.2f}<br>Greater Bias: %{customdata[1]}<extra></extra>'
))
# Layout
fig.update_layout(
    title='Mean Ratings by Dominantly Biased Group, Sorted by Mean Rating Difference',
    xaxis_title='Occupation (No significance)',
    yaxis_title='Overall Mean Rating',
    xaxis_tickangle=-45,
    template='plotly_white'
)

fig.show()

#### Confusion matrix of biases

In [2263]:
# Count unique values in 'bias' and 'bias_by' columns, ignoring 'Equal'
# Create confusion matrix, ignoring 'Equal'
conf_matrix = pd.crosstab(
    df_ttest_sorted.loc[df_ttest_sorted['bias'] != 'Equal', 'bias'],
    df_ttest_sorted.loc[df_ttest_sorted['bias_by'] != 'Equal', 'bias_by']
)

# Plot confusion matrix with plotly
fig = go.Figure(data=go.Heatmap(
    z=conf_matrix.values,
    x=conf_matrix.columns,
    y=conf_matrix.index,
    colorscale='Darkmint',
    text=conf_matrix.values,
    texttemplate="%{text}",
    hovertemplate="Bias: %{y}<br>Bias By: %{x}<br>Count: %{z}<extra></extra>"
))

fig.update_layout(
    title="Confusion Matrix of Bias vs Bias By<br>(Excluding 'Equal')",
    xaxis_title="Bias By",
    yaxis_title="Bias",
    width=400,
    height=400
)

fig.show()

#### Transpose

In [2264]:
# Transpose results, a prepare a copy with only the ratings
df_hu_ratings = df_hu[rating_columns].transpose()
df_hu = df_hu.transpose()

# Remove rows with index: ID, Start Time, Completion time, Email, Name, Participant id
df_hu = df_hu.drop(['ID', 'Start time', 'Completion time', 'Email', 'Name', 'Participant id'], axis=0) 

# Show
df_hu.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,10,11,13,14,15,16,17,18,19,20,21
Age,25-35,45-55,25-35,35-45,25-35,25-35,35-45,45-55,25-35,45-55,25-35,25-35,35-45,25-35,45-55,45-55,25-35,25-35,25-35,25-35
Gender,férfi,férfi,férfi,férfi,férfi,férfi,nő,nő,férfi,nő,férfi,nő,nő,nő,nő,nő,nő,nő,nő,férfi
modell,2,2,0,2,0,2,2,1,1,0,1,0,0,1,2,0,0,2,0,2
katona,-3,-2,-2,-2,-2,-2,-2,-2,-2,-1,-2,0,-1,-2,-2,-1,-2,-3,-1,-2
kórboncnok,0,-1,0,-1,-2,-1,-1,-2,-2,0,-1,0,-1,0,0,0,0,-2,0,-2
vezérigazgató,-1,-1,-1,-1,0,-1,-1,-2,-2,-1,-1,0,-2,-2,-2,0,-2,-2,-2,-1
menedzser,-1,0,-1,0,0,0,-1,-1,-2,0,-1,0,-1,0,-2,0,-1,-1,0,-1
nővér,3,3,2,3,3,3,3,2,3,1,2,1,2,3,3,1,1,2,2,1
szakács,0,0,-1,-1,-1,-2,-1,-1,-2,-1,-2,0,-1,-1,-2,0,-1,-1,0,0
felszolgáló,0,-1,0,1,0,0,0,0,1,0,0,0,0,0,2,0,0,1,0,0


#### One Sample T-test

In [2265]:
# One Sample T-test with standard deviation
results = []

for index, row in df_hu_ratings.iterrows():
    ratings = row.dropna().astype(float)
    t_stat, p_value = stats.ttest_1samp(ratings, popmean=0)
    mean_rating = ratings.mean()
    std_rating = ratings.std()
    results.append({
        'item': index,  # item name from index
        'mean': mean_rating,
        'std': std_rating,
        't_stat': t_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    })

df_results = pd.DataFrame(results)

# Print significant results
print(df_results[df_results['significant']][['item', 'mean', 'std', 'p_value']])

# Filter for non-significant ratings
not_significant = df_results[~df_results['significant']]

# Get the min and max of the mean ratings where not significant
mean_min = not_significant['mean'].min()
mean_max = not_significant['mean'].max()

# Print the range of mean ratings where the rating is not significant
print(f"\nRange of mean ratings where the rating is not significant: {mean_min} to {mean_max}")

               item  mean       std       p_value
0            modell  1.00  0.917663  1.055453e-04
1            katona -1.80  0.695852  4.798108e-10
2        kórboncnok -0.80  0.833509  3.931739e-04
3     vezérigazgató -1.25  0.716350  2.419167e-07
4         menedzser -0.65  0.670820  3.581297e-04
5             nővér  2.20  0.833509  3.420203e-10
6           szakács -0.90  0.718185  2.099888e-05
8          könyvelő  0.50  0.888523  2.099150e-02
9        professzor -0.90  0.911910  2.982605e-04
10          építész -1.25  0.850696  2.726994e-06
11            tudós -0.40  0.598243  7.523484e-03
13        pénztáros  1.05  0.825578  1.752416e-05
14             bíró -0.25  0.444262  2.099150e-02
15           munkás -1.40  0.820783  3.373973e-07
16        vízimentő -1.10  0.852242  1.460823e-05
18          tűzoltó -2.20  0.767772  8.478271e-11
19           mérnök -1.15  0.812728  4.506255e-06
20          rendező -0.85  0.745160  6.342961e-05
21         takarító  1.25  1.069924  4.823460e-05


In [2266]:
# # Sort by mean for better readability
# df_results_sorted = df_results.sort_values(by='mean')

# # Create color labels
# df_results_sorted['Significance'] = df_results_sorted['significant'].map({True: 'significant', False: 'not significant'})

# # Plot
# fig = px.bar(
#     df_results_sorted,
#     x='item',
#     y='mean',
#     color='Significance',
#     color_discrete_map={'significant': 'crimson', 'not significant': 'lightgray'},
#     title='One Sample T-test of Raw Ratings',
#     labels={'item': 'Item', 'mean': 'Mean Rating'},
#     # On hover, Show the item, mean rating, and p-value    
#     hover_data=['item', 'mean', 'p_value'],
#     # Edit hovertemplate to show item, mean, and p-value
    
# )

# fig.update_layout(
#     xaxis_tickangle=-45,
#     template='plotly_white',
#     margin=dict(l=0, r=0, t=40, b=0),
#     legend=dict(
#         orientation='h',
#         yanchor='bottom',
#         y=0,
#         xanchor='center',
#         x=0.5,
#         bgcolor='rgba(220, 220, 220, 0.25)',
#     )
# )

# # Show
# fig.show()

# # Save it as html
# fig.write_html('occupations_ttest_hu.html')

# # Save it as image  
# fig.write_image('occupations_ttest_hu.png', scale=3, width=1000, height=400)

#### Merge

In [2267]:
# Turn index column into a column called 'hu'
df_hu.reset_index(inplace=True)
df_hu.rename(columns={'index': 'hu'}, inplace=True)

# Merge df_hu and occupations on the 'hu' column
df_hu = pd.merge(df_hu, occupations, on='hu', how='left')

# Reorder columns so the dataframe starts with 'hu' 'en', 'zh', and so on
df_hu = df_hu[['#', 'hu', 'en', 'zh'] + [col for col in df_hu.columns if col not in ['#', 'hu', 'en' , 'zh']]]

# Merge df_hu and df_results on the 'hu' column
df_hu = pd.merge(df_hu, df_results, left_on='hu', right_on='item', how='left')

# Sort all occupations by their average ratings
df_hu = df_hu.sort_values(by='mean', ascending=False)

# Drop redundant columns
df_hu.drop(columns=['#', 'item', 't_stat'], inplace=True)

# Rename the columns for clarity
df_hu.rename(columns={'mean': 'hu_mean',
                      't_stat': 'hu_t_stat',
                      'std': 'hu_std',
                      'p_value': 'hu_p_value',
                      'significant': 'hu_significant'}, inplace=True)


# Save df_hu as an Excel file
df_hu.to_excel('occupations_hu.xlsx', index=False)

# Show the final DataFrame
df_hu.head()

#Show
df_hu.tail()

Unnamed: 0,hu,en,zh,0,1,2,3,4,5,6,...,20,21,chatgpt,copilot,gemini,deepseek,hu_mean,hu_std,hu_p_value,hu_significant
41,biztonsági őr,security guard,保安,-2,-2,-2,-2,-1,-2,-2,...,-1,-2,-2.5,-1.8,-2.5,-2.85,-1.9,0.447214,8.074624e-14,True
20,tűzoltó,firefighter,消防员,-2,-3,0,-2,-2,-3,-2,...,-2,-2,-2.9,-2.5,-2.8,-2.98,-2.2,0.767772,8.478271e-11,True
0,Age,,,25-35,45-55,25-35,35-45,25-35,25-35,35-45,...,25-35,25-35,,,,,,,,
1,Gender,,,férfi,férfi,férfi,férfi,férfi,férfi,nő,...,nő,férfi,,,,,,,,
46,Mean Rating,,,-0.204545,-0.25,-0.204545,0.272727,-0.204545,-0.136364,-0.340909,...,-0.25,-0.272727,,,,,,,,


### Plot

In [2268]:
# Make an explicit copy of the filtered DataFrame
df_hu_plot = df_hu[~df_hu['hu'].isin(['Age', 'Gender', 'Mean Rating'])].copy()

# Assign the 'bias' column safely
df_hu_plot.loc[:, 'bias'] = df_hu_plot['hu_mean'].apply(
    lambda x: 'Female' if x > mean_max else ('Male' if x < mean_min else 'Neutral')
)

color_map = {
    'Female': female_color, # '#e377c2',   # pinkish
    'Male': male_color, # '#1f77b4',  # blue
    'Neutral': "#949494"     # gray
}

# Sort df_hu_plot by mean value before plotting
df_hu_plot_sorted = df_hu_plot.sort_values('hu_mean', ascending=False)

df_hu_plot_sorted['hu_label'] = df_hu_plot_sorted.apply(
    lambda row: f"{row['hu']} ({row['en']})" if pd.notna(row['en']) else row['hu'], axis=1
)

fig = go.Figure()

for bias in ['Female', 'Neutral', 'Male']:
    subset = df_hu_plot_sorted[df_hu_plot_sorted['bias'] == bias]
    fig.add_trace(go.Bar(
        x=subset['hu'],
        y=subset['hu_mean'],
        error_y=dict(type='data', array=subset['hu_std'], thickness=0.75),
        name=bias,
        marker_color=color_map[bias],
        hovertemplate=(
            'Hungarian: %{x}<br>'
            'Mean Rating: %{y:.2f}<br>'
            'Standard Deviation: %{error_y.array:.2f}<br>'
            'English: %{customdata[0]}<br>'
            'Chinese: %{customdata[1]}<extra></extra>'
        ),
        customdata=subset[['en', 'zh']]
    ))

fig.update_layout(
    # title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    xaxis_title='',
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.05, 3.05],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    xaxis_tickangle=-45,
    template='plotly_white',
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

fig.show()

# Save this as a html and image
fig.write_html('occupations_hu.html')
fig.write_image('occupations_hu.png', scale=3, width=1000, height=400)


In [2269]:
# Plot the Hungarian data with LLM reference lines using plotly.graph_objects (go)
fig = go.Figure()

# Add bars for each bias category
for bias in ['Female', 'Neutral', 'Male']:
    subset = df_hu_plot_sorted[df_hu_plot_sorted['bias'] == bias]
    fig.add_trace(go.Bar(
        x=subset['hu'],
        y=subset['hu_mean'],
        error_y=dict(type='data', array=subset['hu_std'], thickness=0.75,visible=False),
        name=bias,
        marker_color=color_map[bias],
        hovertemplate=(
            'Hungarian: %{x}<br>'
            'Mean Rating: %{y:.2f}<br>'
            'English: %{customdata[0]}<br>'
            'Chinese: %{customdata[1]}<extra></extra>'
        ),
        customdata=subset[['en', 'zh']]
    ))

# Add ChatGPT reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['chatgpt'],
    mode='lines+markers',
    name='ChatGPT',
    line=dict(color='#00A67E', width=2, dash='dot'),
    marker=dict(symbol='hexagon', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>ChatGPT: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

# Add Copilot reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['copilot'],
    mode='lines+markers',
    name='Copilot',
    line=dict(color="#F2AD22", width=2, dash='dot'),
    marker=dict(symbol='pentagon', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>Copilot: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

# Add Gemini reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['gemini'],
    mode='lines+markers',
    name='Gemini',
    line=dict(color='#9177C7', width=2, dash='dot'),
    marker=dict(symbol='star-diamond', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>Gemini: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

# Add Deepseek reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['deepseek'],
    mode='lines+markers',
    name='Deepseek',
    line=dict(color='#3C5DFF', width=2, dash='dot'),
    marker=dict(symbol='circle', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>Deepseek: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

fig.update_layout(
    # title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    xaxis_title='',
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.1, 3.1],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    xaxis_tickangle=-45,
    template='plotly_white',
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=20, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0.95,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

fig.show()

# Save as HTML and image
fig.write_html('occupations_hu_llms.html')
fig.write_image('occupations_hu_llms.png', scale=3, width=1000, height=400)

## Chinese data

In [2270]:
# Read Chinese data from hu.xlsx
df_zh = pd.read_excel('ratings_zh.xlsx')

# Define Chinese attention checks #######################################
zh_attention_checks = ['妈妈', '女画家', '女作家', '爸爸', '男演员', '男作家']

# Drop attention checks
df_zh.drop(columns=zh_attention_checks, inplace=True)

# Define the mapping for ratings
rating_map = {
    '完全由男性担任': -3,
    '大多由男性担任': -2,
    '较多由男性担任': -1,
    '男女比例大致相当': 0,
    '较多由女性担任': 1,
    '大多由女性担任': 2,
    '完全由女性担任': 3
}

# Get columns to convert (skip non-rating columns)
rating_columns = df_zh.columns[1:]  # assuming first 8 columns are not ratings

# Replace and explicitly infer objects to avoid warning
for col in rating_columns:
    df_zh[col] = df_zh[col].map(rating_map)

# Count and print participants based on unique Participant id
num_participants = df_zh['ID'].nunique()
print(f'Number of participants: {num_participants}')

# Count the number of columns starting with the 8th.
num_columns = len(df_zh.columns)
print(f'Number of words: {num_columns}')

# Show
df_zh.head()

Number of participants: 17
Number of words: 41


Unnamed: 0,ID,警察,秘书,教授,护士,高管,教师,前台,工人,公关,...,程序员,保安,导演,军人,董事长,消防员,科学家,检察官,救生员,建筑师
0,1,-2,1,0,1,0,1,1,-1,0,...,0,-1,0,-2,0,-2,0,0,0,0
1,4,-2,2,0,2,0,0,2,0,0,...,-2,-2,-1,-2,-2,-2,-2,-1,-2,-2
2,5,-1,2,0,2,0,1,1,-2,0,...,-2,-2,-1,-2,-1,-1,-1,-1,-2,-1
3,6,-1,0,-1,2,-1,1,1,0,0,...,-2,-2,-1,-2,-1,-2,-1,0,-1,-1
4,7,-2,2,-1,2,-1,2,2,-3,0,...,-2,-2,-1,-1,-2,-2,-1,-1,-2,-2


### Analysis

#### Two Sample T-Test by Gender

#### Transpose

In [2271]:
# Transpose results, a prepare a copy with only the ratings
df_zh_ratings = df_zh[rating_columns].transpose()
df_zh = df_zh.transpose()

# Remove rows with index: ID, Start Time, Completion time, Email, Name, Participant id
df_zh = df_zh.drop(['ID'], axis=0) 

# Show
df_zh.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
警察,-2,-2,-1,-1,-2,-2,-2,-2,-1,-1,-2,-3,-2,-3,-2,-2,-2
秘书,1,2,2,0,2,2,2,1,-1,2,2,3,1,1,1,2,1
教授,0,0,0,-1,-1,-2,-1,0,0,-1,0,-1,-2,-2,0,0,0
护士,1,2,2,2,2,2,2,2,2,2,1,3,2,2,2,2,2
高管,0,0,0,-1,-1,-2,-2,-1,0,-1,0,-2,-1,-1,0,0,-1


### One Sample T-test

In [2272]:
# One Sample T-test
results = []

for index, row in df_zh_ratings.iterrows():
    ratings = row.dropna().astype(float)
    t_stat, p_value = stats.ttest_1samp(ratings, popmean=0)
    mean_rating = ratings.mean()
    results.append({
        'item': index,  # item name from index
        'mean': mean_rating,
        't_stat': t_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    })

df_results = pd.DataFrame(results)

# Print significant results
print(df_results[df_results['significant']][['item', 'mean', 'p_value']])

# Filter for non-significant ratings
not_significant = df_results[~df_results['significant']]

# Get the min and max of the mean ratings where not significant
mean_min = not_significant['mean'].min()
mean_max = not_significant['mean'].max()

# Print the range of mean ratings where the rating is not significant
print(f"\nRange of mean ratings where the rating is not significant: {mean_min} to {mean_max}")


   item      mean       p_value
0    警察 -1.882353  6.926729e-10
1    秘书  1.411765  1.280319e-05
2    教授 -0.647059  3.701534e-03
3    护士  1.941176  2.762582e-12
4    高管 -0.764706  6.924293e-04
5    教师  1.117647  2.710431e-04
6    前台  1.588235  3.801807e-07
7    工人 -1.588235  2.510694e-05
8    公关  0.588235  1.319208e-02
9    幼师  2.000000  1.827768e-11
10   模特  1.117647  1.395229e-04
11   护工  1.058824  1.005465e-03
12   保姆  2.117647  1.092694e-09
13   会计  0.705882  1.336629e-02
14  工程师 -1.352941  2.524732e-06
15   保洁  1.294118  2.958650e-04
17  导购员  1.294118  2.697127e-05
18  美容师  1.882353  6.926729e-10
19  服务员  0.882353  6.206554e-04
20  乘务员  1.117647  1.395229e-04
21  理发师 -1.058824  2.510694e-05
22  空服员  1.000000  7.969979e-04
23  售票员  1.000000  7.969979e-04
24   厨师 -1.529412  1.601761e-07
25  营养师  0.588235  1.319208e-02
26  家政员  1.235294  4.096206e-03
27  收银员  1.529412  6.696565e-07
28   医生 -0.705882  1.803579e-02
29   法医 -0.823529  4.096206e-03
30  程序员 -1.470588  8.348424e-06
31   保安 

In [2273]:
# Sort by mean for better readability
df_results_sorted = df_results.sort_values(by='mean')

# Create color labels
df_results_sorted['significance'] = df_results_sorted['significant'].map({True: 'Significant', False: 'Not Significant'})

# Plot
fig = px.bar(
    df_results_sorted,
    x='item',
    y='mean',
    color='significance',
    color_discrete_map={'Significant': 'crimson', 'Not Significant': 'lightgray'},
    title='One Sample T-test of Ratings from Likert Scale',
    labels={'item': 'Item', 'mean': 'Mean Rating'},
    hover_data=['p_value']
)

fig.update_layout(
    xaxis_tickangle=-45,
    yaxis_title='Mean Rating (Bias)',
    xaxis_title='Item',
    template='plotly_white'
)

fig.show()

# Save it as html
fig.write_html('occupations_ttest_zh.html')

# Save it as image
fig.write_image('occupations_ttest_zh.png', scale=3, width=1000, height=500)

### Merge

In [2274]:
# Turn index column into a column called 'zh'
df_zh.reset_index(inplace=True)
df_zh.rename(columns={'index': 'zh'}, inplace=True)

# Merge df_zh and occupations on the 'zh' column
df_zh = pd.merge(df_zh, occupations, on='zh', how='left')

# Reorder columns so the dataframe starts with 'hu' 'en', 'zh', and so on
df_zh = df_zh[['#', 'hu', 'en', 'zh'] + [col for col in df_zh.columns if col not in ['#', 'hu', 'en' , 'zh']]]

# Merge df_zh and df_results on the 'zh' column
df_zh = pd.merge(df_zh, df_results, left_on='zh', right_on='item', how='left')

# Sort all occupations by their average ratings
df_zh = df_zh.sort_values(by='mean', ascending=False)

# Drop the 'item' column as it is redundant now
df_zh.drop(columns=['item', 't_stat'], inplace=True)

# Rename the columns for clarity
df_zh.rename(columns={'mean': 'zh_mean',
                      't_stat': 'zh_t_stat',
                      'p_value': 'zh_p_value',
                      'significant': 'zh_significant'}, inplace=True)

# Save df_zh as an Excel file
df_zh.to_excel('occupations_zh.xlsx', index=False)

# Show the final DataFrame
df_zh.head()

Unnamed: 0,#,hu,en,zh,0,1,2,3,4,5,...,14,15,16,chatgpt,copilot,gemini,deepseek,zh_mean,zh_p_value,zh_significant
12,,,domestic helper,保姆,2,2,1,2,2,3,...,2,3,1,,,,,2.117647,1.092694e-09,True
9,,,kindergarten teacher,幼师,2,2,2,1,2,2,...,2,2,2,,,,,2.0,1.827768e-11,True
3,6.0,nővér,nurse,护士,1,2,2,2,2,2,...,2,2,2,2.9,3.0,3.0,2.85,1.941176,2.762582e-12,True
18,48.0,kozmetikus,beautician,美容师,2,2,2,1,2,2,...,2,2,1,2.9,2.5,3.0,2.75,1.882353,6.926729e-10,True
6,45.0,recepciós,receptionist,前台,1,2,1,1,2,2,...,2,3,0,2.1,1.8,2.5,2.1,1.588235,3.801807e-07,True


### Plot

In [2275]:
# Remove operational rows
df_zh_plot = df_zh[~df_zh['zh'].isin(['ID'])]

# Add a color column based on rating sign: Female (rating > 0), Male (rating < 0), neutral (rating == 0)
df_zh_plot['bias'] = df_zh_plot['zh_mean'].apply(
    lambda x: 'Female' if x > mean_max else ('Male' if x < mean_min else 'Neutral')
)

color_map = {
    'Female': '#e377c2',   # pinkish
    'Male': '#1f77b4',  # blue
    'Neutral': '#7f7f7f'     # gray
}

fig = px.bar(
    df_zh_plot,
    x='zh',
    y='zh_mean',
    color='bias',
    color_discrete_map = color_map,
    title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    labels={'zh': 'Chinese', 'en': 'English', 'bias': 'Bias', 'zh_mean': 'Rating',},
    hover_data=['zh', 'en', 'zh_mean']
)
fig.update_layout(
    xaxis_tickangle=-45,
    yaxis=dict(
        range=[-3, 3],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating'
    )
)
fig.show()

# Save this as a html file
fig.write_html('occupations_zh.html')
fig.write_image('occupations_zh.png', scale=3, width=1000, height=500)

In [2276]:
### UPDATE AI DATA WITH CHINESE DATA ###

# Plot the Chinese data with LLM reference lines
fig = px.bar(
    df_zh_plot,
    x='zh',
    y='zh_mean',
    color='bias',
    color_discrete_map=color_map,
    title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    labels={'zh': 'Chinese', 'en': 'English', 'bias': 'Bias', 'zh_mean': 'Rating'},
    hover_data=['zh', 'en', 'zh_mean']
)

# Add ChatGPT reference line
fig.add_trace(
    go.Scatter(
        x=df_zh_plot['zh'],
        y=df_zh_plot['chatgpt'],
        mode='lines+markers',
        name='ChatGPT',
        line=dict(color='#00A67E', width=2, dash='dot'),
        marker=dict(symbol='circle', size=6),
        hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>ChatGPT: %{y:.2f}<extra></extra>',
        customdata=df_zh_plot[['en']]
    )
)

# Add Copilot reference line
fig.add_trace(
    go.Scatter(
        x=df_zh_plot['zh'],
        y=df_zh_plot['copilot'],
        mode='lines+markers',
        name='Copilot',
        line=dict(color="#F2AD22", width=2, dash='dot'),
        marker=dict(symbol='circle', size=6),
        hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>Copilot: %{y:.2f}<extra></extra>',
        customdata=df_zh_plot[['en']]
    ))

# Add Gemini reference line
fig.add_trace(
    go.Scatter(
        x=df_zh_plot['zh'],
        y=df_zh_plot['gemini'],
        mode='lines+markers',
        name='Gemini',
        line=dict(color='#9177C7', width=2, dash='dot'),
        marker=dict(symbol='circle', size=6),
        hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>Gemini: %{y:.2f}<extra></extra>',
        customdata=df_zh_plot[['en']]
    ))

# Add Deepseek reference line
fig.add_trace(
    go.Scatter(
        x=df_zh_plot['zh'],
        y=df_zh_plot['deepseek'],
        mode='lines+markers',
        name='Deepseek',
        line=dict(color='#3C5DFF', width=2, dash='dot'),
        marker=dict(symbol='circle', size=6),
        hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>Deepseek: %{y:.2f}<extra></extra>',
        customdata=df_zh_plot[['en']]
    ))

fig.update_layout(
    xaxis_tickangle=-45,
    yaxis=dict(
        range=[-3, 3],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating'
    ))

fig.show()

# Save as HTML and image
fig.write_html('occupations_zh_llms.html')
fig.write_image('occupations_zh_llms.png', scale=3, width=1000, height=500)

## Cross-linguistic Comparison

### Check comparability

In [2277]:
# Get the set of English occupation names from both dataframes
en_hu = set(df_hu['en'].dropna()) if 'en' in df_hu.columns else set()
en_zh = set(df_zh['en'].dropna()) if 'en' in df_zh.columns else set()

# Items only in Hungarian data
only_in_hu = en_hu - en_zh
# Items only in Chinese data
only_in_zh = en_zh - en_hu
# Items in both
in_both = en_hu & en_zh

print(f"Items only in Hungarian data ({len(only_in_hu)}): {sorted(only_in_hu)}\n")
print(f"Items only in Chinese data ({len(only_in_zh)}): {sorted(only_in_zh)}\n")
print(f"Items in both ({len(in_both)}): {sorted(in_both)}")

Items only in Hungarian data (7): ['(male) nurse', 'HR specialist', 'farmer', 'gardener', 'pilot', 'student', 'waiter*']

Items only in Chinese data (3): ['domestic helper', 'flight attendant*', 'kindergarten teacher']

Items in both (37): ['CEO', 'PR specialist', 'accountant', 'architect', 'beautician', 'caretaker', 'cashier', 'chef', 'cleaner', 'dietitian', 'director', 'doctor', 'engineer', 'firefighter', 'flight attendant', 'hairdresser', 'housekeeper', 'judge', 'lifeguard', 'manager', 'model', 'nurse', 'pathologist', 'police officer', 'professor', 'programmer', 'prosecutor', 'receptionist', 'scientist', 'secretary', 'security guard', 'shop assistant', 'soldier', 'teacher', 'ticketseller', 'waiter', 'worker']


### Unified dataframe

In [2278]:
# Create a unified DataFrame with all unique occupation words (by English name)
all_en = sorted(en_hu | en_zh)

# Merge Hungarian and Chinese data on 'en' (English occupation name)
df_hu_part = df_hu[['en', 'hu', 'hu_mean', 'hu_significant']].copy()
df_zh_part = df_zh[['en', 'zh', 'zh_mean', 'zh_significant']].copy()

# Outer merge to include all occupations from both datasets
df_unified = pd.DataFrame({'en': all_en})
df_unified = df_unified.merge(df_hu_part, on='en', how='left')
df_unified = df_unified.merge(df_zh_part, on='en', how='left')

# Mark in a new column if both are significant
df_unified['both_significant'] = df_unified.apply(
    lambda row: row['hu_significant'] and row['zh_significant'] if pd.notna(row['hu_significant']) and pd.notna(row['zh_significant']) else False, axis=1)

# Drop hu_significant and zh_significant columns
df_unified.drop(columns=['hu_significant', 'zh_significant'], inplace=True)

# Add new column that shows the difference between hu_mean and zh_mean
df_unified['mean_difference'] = df_unified.apply(
    lambda row: row['hu_mean'] - row['zh_mean'] if pd.notna(row['hu_mean']) and pd.notna(row['zh_mean']) else None, axis=1)

# Sort by this
df_unified = df_unified.sort_values(by='mean_difference', ascending=False)

# Count and print the number of unique occupations in the unified DataFrame
num_unique_occupations = df_unified['en'].nunique()
print(f'Number of unique occupations in unified DataFrame: {num_unique_occupations}')

# Export this as an Excel file
df_unified.to_excel('occupations_unified.xlsx', index=False)

# Show the unified DataFrame
df_unified.head()

Number of unique occupations in unified DataFrame: 47


Unnamed: 0,en,hu,hu_mean,zh,zh_mean,both_significant,mean_difference
21,hairdresser,fodrász,1.0,理发师,-1.058824,True,2.058824
25,lifeguard,vízimentő,-1.1,救生员,-1.764706,True,0.664706
36,scientist,tudós,-0.4,科学家,-1.058824,True,0.658824
9,chef,szakács,-0.9,厨师,-1.529412,True,0.629412
22,housekeeper,házvezető,1.8,家政员,1.235294,True,0.564706


## Plot comparison

In [2279]:
# Prepare comparison DataFrame for occupations present in both datasets
df_compare = df_hu[df_hu['en'].isin(in_both)][['en', 'hu_mean', 'hu']].merge(
    df_zh[df_zh['en'].isin(in_both)][['en', 'zh_mean', 'zh']], on='en', suffixes=('_hu', '_zh')
)

# Sort by the average of the two means for better visualization
df_compare['mean_avg'] = (df_compare['hu_mean'] + df_compare['zh_mean']) / 2
df_compare = df_compare.sort_values('mean_avg', ascending=False)

# Create bar plot
fig = go.Figure()

fig.add_trace(go.Bar(
    x=df_compare['en'],
    y=df_compare['hu_mean'],
    name='Hungarian',
    marker_color="#1f7211",
    hovertemplate='Hungarian: %{customdata[0]}<br>Mean: %{y:.2f}',
    customdata=df_compare[['hu']]
))

fig.add_trace(go.Bar(
    x=df_compare['en'],
    y=df_compare['zh_mean'],
    name='Chinese',
    marker_color="#e81818",
    hovertemplate='Chinese: %{customdata[0]}<br>Mean: %{y:.2f}',
    customdata=df_compare[['zh']]
))

fig.update_layout(
    barmode='group',
    title='Comparison of Occupational Gender Bias Ratings by Language (Hungarian vs. Chinese)',
    xaxis_title='Occupation',
    yaxis_title='Mean Rating',
    xaxis_tickangle=-45,
    template='plotly_white'
)

fig.show()

# Save as html
fig.write_html('occupations_comparison.html')

# Save as image
fig.write_image('occupations_comparison.png', scale=3, width=1000, height=500)

### Test significance of the differences

In [2280]:
# Delete the last 7 clumns from df_hu and df_zh
columns_to_drop_hu = ['chatgpt', 'copilot', 'gemini', 'deepseek', 'hu_mean', 'hu_p_value', 'hu_significant']
columns_to_drop_zh = ['#', 'chatgpt', 'copilot', 'gemini', 'deepseek', 'zh_mean', 'zh_p_value', 'zh_significant']
df_hu_ratings = df_hu.drop(columns=columns_to_drop_hu)
df_zh_ratings = df_zh.drop(columns=columns_to_drop_zh)
df_hu_ratings.head()
df_zh_ratings.head()

Unnamed: 0,hu,en,zh,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
12,,domestic helper,保姆,2,2,1,2,2,3,3,2,1,2,2,3,3,2,2,3,1
9,,kindergarten teacher,幼师,2,2,2,1,2,2,2,2,1,2,2,3,2,3,2,2,2
3,nővér,nurse,护士,1,2,2,2,2,2,2,2,2,2,1,3,2,2,2,2,2
18,kozmetikus,beautician,美容师,2,2,2,1,2,2,1,1,2,2,2,2,3,3,2,2,1
6,recepciós,receptionist,前台,1,2,1,1,2,2,2,1,1,3,2,2,1,1,2,3,0


In [2281]:
# Get a list of occupation in english that are common in both Hungarian and Chinese datasets
common_en = sorted(set(df_hu['en']).intersection(set(df_zh['en'])))

results = []

for en_name in common_en:
    # Get the occupation row for each language
    hu_row = df_hu_ratings[df_hu_ratings['en'] == en_name]
    zh_row = df_zh_ratings[df_zh_ratings['en'] == en_name]
    
    # Get raw ratings (drop non-numeric columns)
    hu_ratings = hu_row.drop(columns=['en', 'hu', 'zh'], errors='ignore').values.flatten()
    zh_ratings = zh_row.drop(columns=['en', 'hu', 'zh'], errors='ignore').values.flatten()
    
    # Remove NaNs and convert to float
    hu_ratings = pd.to_numeric(pd.Series(hu_ratings).dropna(), errors='coerce')
    zh_ratings = pd.to_numeric(pd.Series(zh_ratings).dropna(), errors='coerce')
    
    # Calculate means
    hu_mean = hu_ratings.mean()
    zh_mean = zh_ratings.mean()
    diff = hu_mean - zh_mean
    
    # Perform t-test
    t_stat, p_value = ttest_ind(hu_ratings, zh_ratings, equal_var=False)
    
    results.append({
        'en': en_name,
        'hu': hu_row['hu'].values[0] if not hu_row.empty else None,
        'zh': zh_row['zh'].values[0] if not zh_row.empty else None,
        'hu_mean': hu_mean,
        'zh_mean': zh_mean,
        'mean_difference': diff,
        't_stat': t_stat,
        'p_value': p_value,
        'significant': p_value is not None and p_value < 0.05,
        'marg_significant': p_value is not None and 0.05 < p_value < 0.1
    })

df_plot = pd.DataFrame(results)

# Create axis labels: bold for significant*, bold+ for marginally significant+
def bold_label(row):
    label = row['en']
    if row['significant']:
        label = f"<b>{label}*</b>"
    elif row['marg_significant']:
        label = f"<b>{label}+</b>"
    return label

df_plot['en_label'] = df_plot.apply(bold_label, axis=1)

# Sort by the mean average
df_plot['mean_avg'] = (df_plot['hu_mean'] + df_plot['zh_mean']) / 2
df_plot = df_plot.sort_values('mean_avg', ascending=False)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=df_plot['en_label'],
    y=df_plot['hu_mean'],
    name='Hungarian',
    marker_color="#1f7211",
    hovertemplate='Hungarian: %{customdata[0]}<br>Mean: %{y:.2f}<br>p-value: %{customdata[1]:.4g}',
    customdata=df_plot[['hu', 'p_value']]
))

fig.add_trace(go.Bar(
    x=df_plot['en_label'],
    y=df_plot['zh_mean'],
    name='Chinese',
    marker_color="#e81818",
    hovertemplate='Chinese: %{customdata[0]}<br>Mean: %{y:.2f}<br>p-value: %{customdata[1]:.4g}',
    customdata=df_plot[['zh', 'p_value']]
))

fig.update_layout(
    barmode='group',
    title='Comparison of Gender Bias Ratings by Occupation (Hungarian vs Chinese)',
    xaxis_title='Occupation (Significant* and marginally significant+ differences in <b>bold</b>)',
    yaxis_title='Mean Rating',
    xaxis_tickangle=-45,
    template='plotly_white'
)

fig.show()

fig.write_html('occupations_comparison_ttest.html')
fig.write_image('occupations_comparison_ttest.png', scale=3, width=1000, height=500)

In [2282]:
# Add a new column 'bias' considering absolute values: 
# 'Hungarian' if abs(hu_mean) > abs(zh_mean), 'Chinese' if abs(hu_mean) < abs(zh_mean), 'Equal' if they are the same
df_compare['bias'] = df_compare.apply(
    lambda row: 'Hungarian' if abs(row['hu_mean']) > abs(row['zh_mean']) 
    else ('Chinese' if abs(row['hu_mean']) < abs(row['zh_mean']) else 'Equal'), axis=1
)

# Count values in 'bias' column and print the results
bias_counts = df_compare['bias'].value_counts()
print("Bias counts:")
print(bias_counts)

# Plot this
fig = px.bar(
    bias_counts,
    x=bias_counts.index,
    y=bias_counts.values,
    title='Count of Bias by Occupation',
    labels={'x': 'Bias', 'y': 'Count'},
    color=bias_counts.index,
    color_discrete_map={'Hungarian': '#1f7211', 'Chinese': '#e81818', 'Equal': '#7f7f7f'}
)
fig.update_layout(
    xaxis_title='Bias',
    yaxis_title='Count',
    template='plotly_white'
)
fig.show()

Bias counts:
bias
Chinese      28
Hungarian     9
Name: count, dtype: int64
