# Occupational Gender Bias

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

In [2]:
print(px.colors.qualitative.Pastel)

# Custom colors
female_color = "#dcb0f2"
male_color = "#9eb9f3"
neutral_color = "#b3b3b3"

age_shades = ['#f6cf71', "#f6b671", "#f89c74"]

# Define colors for Hungarian and Chinese bars
hungarian_color = "#87c55f"  # greenish
chinese_color = "#c55f5f"  # reddish

hungarian_color_scale=["white", hungarian_color]
chinese_color_scale=["white", chinese_color]

['rgb(102, 197, 204)', 'rgb(246, 207, 113)', 'rgb(248, 156, 116)', 'rgb(220, 176, 242)', 'rgb(135, 197, 95)', 'rgb(158, 185, 243)', 'rgb(254, 136, 177)', 'rgb(201, 219, 116)', 'rgb(139, 224, 164)', 'rgb(180, 151, 231)', 'rgb(179, 179, 179)']


In [3]:
# Colors in RGB format for Plotly
colors_in_rgb = ['rgb(102, 197, 204)', 'rgb(246, 207, 113)', 'rgb(248, 156, 116)', 'rgb(220, 176, 242)', 'rgb(135, 197, 95)', 'rgb(158, 185, 243)', 'rgb(254, 136, 177)', 'rgb(201, 219, 116)', 'rgb(139, 224, 164)', 'rgb(180, 151, 231)', 'rgb(179, 179, 179)']

# Convert these into hex format for Plotly
def rgb_to_hex(rgb):
    return '#{:02x}{:02x}{:02x}'.format(*rgb)
colors_in_hex = [rgb_to_hex(tuple(map(int, color[4:-1].split(',')))) for color in colors_in_rgb]
colors_in_hex = ['#66c5cc', '#f6cf71','#f89c74','#dcb0f2','#87c55f','#9eb9f3','#fe88b1','#c9db74','#8be0a4','#b497e7','#b3b3b3']

## Occupations data

In [4]:
# Read files
occupations = pd.read_excel('occupations.xlsx')

# Remove r column
occupations = occupations.drop(columns=['r', 'att_hu', 'att_zh' ])
occupations.head()

Unnamed: 0,#,#_zh,zh,en,hu,chatgpt_hu,copilot_hu,deepseek_hu,gemini_hu,chatgpt_zh,copilot_zh,deepseek_zh,gemini_zh
0,1.0,,模特,model,modell,1.881,1.993,0.97,1.115,1.84,2.45,1.44,0.1
1,2.0,,军人,soldier,katona,-2.008889,-2.751,-2.95,-2.98,-2.0,-2.5,-2.36,-2.3
2,3.0,,法医,pathologist,kórboncnok,-1.074444,-1.336,-2.29,-0.63,-1.25,-0.75,-1.26,-0.4
3,4.0,,董事长,CEO,vezérigazgató,-1.287778,-2.541,-2.09,-1.77,-2.0,-2.25,-1.94,-1.6
4,5.0,,高管,manager,menedzser,-0.487778,-0.752,-1.18,-0.47,-1.25,-1.575,-1.66,-1.1


## Hungarian data

In [5]:
# Read Hungarian participant data from prolific_hu.csv
df_prolific_hu = pd.read_csv('prolific_hu.csv')

# Return Participant ID of rows where Status is RETURNED or REJECTED
rejects = df_prolific_hu[df_prolific_hu['Status'].isin(['RETURNED', 'REJECTED'])]['Participant id'].unique()

# create reject list
rejects_list = rejects.tolist()
print(f"Rejected participants: {rejects_list}")

df_prolific_hu.head()

Rejected participants: ['5e57a0020c3c6a14a1624031', '599494e7bf8bcf0001ab6973', '5f5a27482be30c0718bbf1e0']


Unnamed: 0,Submission id,Participant id,Status,Custom study tncs accepted at,Started at,Completed at,Reviewed at,Archived at,Time taken,Completion code,Total approvals,Age,Sex,Ethnicity simplified,Country of birth,Country of residence,Nationality,Language,Student status,Employment status
0,68708cf6b2a5c6baa48d6343,5ef60257cd680928de23ccae,APPROVED,Not Applicable,2025-07-11T04:03:02.613000Z,2025-07-11T04:07:42.625000Z,2025-07-15T04:08:31.855000Z,2025-07-11T04:07:43.031319Z,281.0,C1MO6037,377,29,Male,White,Hungary,Hungary,Hungary,Hungarian,Yes,Full-Time
1,68708d873d4d0b6d92e59937,5c48be0496d59b000183e68d,APPROVED,Not Applicable,2025-07-11T04:05:42.316000Z,2025-07-11T04:13:25.502000Z,2025-07-15T04:08:32.344000Z,2025-07-11T04:13:25.947923Z,464.0,C1MO6037,1457,48,Male,White,Hungary,Hungary,Hungary,Hungarian,No,Full-Time
2,68708f072c19330dab93cd7b,5d3449524e8363001735fc41,APPROVED,Not Applicable,2025-07-11T04:12:14.614000Z,2025-07-11T04:20:51.642000Z,2025-07-15T04:08:32.762000Z,2025-07-11T04:20:52.060603Z,518.0,C1MO6037,657,42,Male,White,Hungary,Hungary,Hungary,Hungarian,DATA_EXPIRED,Full-Time
3,68708f80de42cf279f3089d8,5a913d2cf0536100017196d8,APPROVED,Not Applicable,2025-07-11T04:13:57.716000Z,2025-07-11T04:16:47.135000Z,2025-07-15T04:08:33.160000Z,2025-07-11T04:16:47.592395Z,170.0,C1MO6037,29,27,Male,White,Hungary,Hungary,Hungary,Hungarian,No,Full-Time
4,68708fb5ab7d87dd294e6c74,5d3873197860c8001a106e02,APPROVED,Not Applicable,2025-07-11T04:16:49.056000Z,2025-07-11T04:21:24.387000Z,2025-07-15T04:08:33.533000Z,2025-07-11T04:21:24.787505Z,276.0,C1MO6037,2564,34,Male,White,Hungary,Hungary,Hungary,Hungarian,No,Full-Time


In [6]:
# Read Hungarian data from hu.xlsx
df_hu = pd.read_excel('ratings_hu_24.xlsx')

# Rename Prolific ID to Participant ID
df_hu.rename(columns={'Prolific ID': 'Participant ID'}, inplace=True)

# Print number of participants
print(f"Number of participants: {df_hu.shape[0]}")

# Define Hungarian attention checks
hu_attention_checks = ['pincérnő', 'titkárnő', 'tanárnő', 'takarítónő', 'ápolónő', 'házvezetőnő']

# Show wo failed attention any checks, find participants who did not mark all hu_attention_checks columns as 3
failed_attention = df_hu[df_hu[hu_attention_checks].apply(lambda row: any(row != 'Teljesen női'), axis=1)]

# Show the failed participants (if any)
failed = failed_attention[['Participant ID'] + hu_attention_checks] if not failed_attention.empty else print("All participants passed the attention checks.")
failed

Number of participants: 24


Unnamed: 0,Participant ID,pincérnő,titkárnő,tanárnő,takarítónő,ápolónő,házvezetőnő
2,5a913d2cf0536100017196d8,Teljesen női,Nagyrészt női,Teljesen női,Teljesen női,Teljesen női,Teljesen női
9,599494e7bf8bcf0001ab6973,Teljesen női,Teljesen férfi,Teljesen női,Teljesen női,Teljesen női,Teljesen női
12,5e9ab5df2893af141343bce7,Inkább női,Nagyrészt női,Inkább női,Inkább női,Nagyrészt női,Inkább női
14,S,Teljesen női,Teljesen női,Teljesen női,Nagyrészt női,Teljesen női,Teljesen női


In [7]:
# Remove those who failed miserably
df_hu = df_hu[df_hu['Participant ID'] != '5e9ab5df2893af141343bce7']
df_hu = df_hu[df_hu['Participant ID'] != '599494e7bf8bcf0001ab6973']

# Remove participants who were rejected
df_hu = df_hu[~df_hu['Participant ID'].isin(rejects_list)]

# Print number of participants
print(f"Number of participants: {df_hu.shape[0]}")

# Drop attention checks
df_hu.drop(columns=hu_attention_checks, inplace=True)

# Drop column nővér because it is not needed
# df_hu = df_hu.drop(columns=['nővér'])

Number of participants: 22


In [8]:
# Define the mapping for ratings
rating_map = {
    'Teljesen férfi': -3,
    'Nagyrészt férfi': -2,
    'Inkább férfi': -1,
    'Semleges/egyenlő': 0,
    'Inkább női': 1,
    'Nagyrészt női': 2,
    'Teljesen női': 3
}

# Get columns to convert (skip non-rating columns)
rating_columns = df_hu.columns[8:]  # assuming first 8 columns are not ratings

# Replace and explicitly infer objects to avoid warning
for col in rating_columns:
    df_hu[col] = df_hu[col].map(rating_map)
    
# Drop rows where Participant ID is in rejects_list
df_hu = df_hu[~df_hu['Participant ID'].isin(rejects_list)]

# Rename Életkor to Age and Nem to Gender
df_hu.rename(columns={'Életkor': 'Age', 'Nem':'Gender'}, inplace=True)

# Replace férfi to male and nő to female in Gender column
df_hu['Gender'] = df_hu['Gender'].replace({'férfi': 'male', 'nő': 'female'})

# Count and print participants based on unique Participant ID
num_participants = df_hu['Participant ID'].nunique()
print(f'Number of participants: {num_participants}')

# Count the number of columns starting with the 8th.
num_columns = len(df_hu.columns) - 8
print(f'Number of words: {num_columns}')

# Show
df_hu.head()


Number of participants: 22
Number of words: 44


Unnamed: 0,ID,Start time,Completion time,Email,Name,Participant ID,Age,Gender,modell,katona,...,dietetikus,tanár,rendőr,pilóta,recepciós,biztonsági őr,ügyész,kozmetikus,programozó,diák
0,1,2025-07-11 12:06:52,2025-07-11 12:07:43,anonymous,,5ef60257cd680928de23ccae,25-35,male,2,-3,...,0,0,-2,-1,0,-2,0,2,-1,0
1,2,2025-07-11 12:06:58,2025-07-11 12:13:33,anonymous,,5c48be0496d59b000183e68d,45-55,male,2,-2,...,0,0,-2,-2,1,-2,-1,3,-2,0
2,3,2025-07-11 12:14:03,2025-07-11 12:16:52,anonymous,,5a913d2cf0536100017196d8,25-35,male,0,-2,...,0,0,-1,0,0,-2,0,2,0,0
3,4,2025-07-11 12:12:24,2025-07-11 12:21:19,anonymous,,5d3449524e8363001735fc41,35-45,male,2,-2,...,1,2,-2,-2,2,-2,-1,3,-2,0
4,5,2025-07-11 12:16:52,2025-07-11 12:21:35,anonymous,,5d3873197860c8001a106e02,25-35,male,0,-2,...,0,0,-1,-1,0,-1,0,2,-2,0


### Demographics

In [9]:
color_map = {
    'Men': female_color,
    'Women': male_color,
    'Equal': neutral_color
}

# Three shades of teal for age groups
gender_colors = [female_color, male_color]

# Prepare gender and age counts and percentages
gender_counts = df_hu['Gender'].value_counts()
gender_labels = gender_counts.index
gender_labels_en = ['Male' if g == 'male' else 'Female' for g in gender_labels]
gender = gender_counts / gender_counts.sum() * 100

age_counts = df_hu['Age'].value_counts().sort_index()
age = age_counts / age_counts.sum() * 100

# Gender pie with custom colors
gender_pie = go.Pie(
    labels=gender_labels_en,
    values=gender.values,
    name='Gender',
    hole=0.33,
    title='Gender',
    marker=dict(colors=gender_colors),
    customdata=np.stack([gender_labels_en, gender_counts.values, gender.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(gender_labels_en, gender_counts.values, gender.values)],
)

# Age pie with custom teal shades (repeat if more age groups)
age_colors = (age_shades * ((len(age) // len(age_shades)) + 1))[:len(age)]
age_pie = go.Pie(
    labels=age.index,
    values=age.values,
    name='Age',
    hole=0.33,
    title='Age',
    marker=dict(colors=age_colors),
    customdata=np.stack([age.index, age_counts.values, age.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(age.index, age_counts.values, age.values)],
)

# Create subplot with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=['Gender Distribution', 'Age Distribution']
)

fig.add_trace(gender_pie, 1, 1)
fig.add_trace(age_pie, 1, 2)

fig.update_layout(
    font = dict(family="Times New Roman, serif", size=32, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    showlegend=False,
    legend=dict(
        font=dict(size=28),
        orientation='v',
        x=0.5,
        xanchor='center',
        y=1,
        yanchor='top'
    ),
    annotations=[
        dict(
            text=f'No. of participants: {df_hu.shape[0]}',
            x=0.5, y=0.9, xref='paper', yref='paper',
            showarrow=False, font=dict(size=28), align='center'
        )
    ]
)

fig.show()

# Save it as html
fig.write_html('demographics_hu.html')

# Save it as image
fig.write_image('demographics_hu.png', scale=3, width=1000, height=400)

### Analysis

#### Two-sample t-test by gender

In [10]:
# Add a new column with the mean of the ratings for each participant
df_hu['Mean Rating'] = df_hu.iloc[:, 8:].mean(axis=1)

# Separate df_hu into male and female datasets based on the Gender column
df_hu_male = df_hu[df_hu['Gender'] == 'male']
df_hu_female = df_hu[df_hu['Gender'] == 'female']

# Show the number of male and female participants
print(f"Number of male participants: {df_hu_male['Participant ID'].nunique()}")
print(f"Number of female participants: {df_hu_female['Participant ID'].nunique()}")

# Two Sample T-Test between male and female participants for each occupation word
results = []
for col in rating_columns:
    male_ratings = df_hu_male[col].dropna().astype(float)
    female_ratings = df_hu_female[col].dropna().astype(float)
    # Only test if both groups have at least 2 ratings
    if len(male_ratings) > 1 and len(female_ratings) > 1:
        t_stat, p_value = stats.ttest_ind(male_ratings, female_ratings, equal_var=False)
        mean_male = male_ratings.mean()
        mean_female = female_ratings.mean()
        # Get English translation from occupations DataFrame
        en_translation = occupations.loc[occupations['hu'] == col, 'en'].values
        en_translation = en_translation[0] if len(en_translation) > 0 else ""
        results.append({
            'occupation': col,
            'mean_male': mean_male,
            'mean_female': mean_female,
            't_stat': t_stat,
            'p_value': p_value,
            'significant': p_value < 0.05,
            'marg_significant': 0.05 < p_value < 0.1,
            'en': en_translation
        })

# Create a DataFrame from the results
df_ttest = pd.DataFrame(results)

# Add overall mean rating for sorting
df_ttest['overall_mean'] = (df_ttest['mean_male'] + df_ttest['mean_female']) / 2

# Sort by overall mean rating
df_ttest_sorted = df_ttest.sort_values('overall_mean', ascending=False)

# Prepare axis labels: bold for significant
def bold_label(row):
    occupation = row['occupation']
    if row.get('significant'):
        occupation = f"<b>{occupation}*</b>"
    if row.get('marg_significant'):
        occupation = f"<b>{occupation}+</b>"
    return occupation

df_ttest_sorted['occupation_label'] = df_ttest_sorted.apply(bold_label, axis=1)

# Plot with English translation in hover
fig = go.Figure()

fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation_label'],
    y=df_ttest_sorted['mean_male'],
    name='Male',
    marker_color=male_color,
    customdata=df_ttest_sorted[['en']],
    hovertemplate='Occupation: %{x}<br>English: %{customdata[0]}<br>Mean (Male): %{y:.2f}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation_label'],
    y=df_ttest_sorted['mean_female'],
    name='Female',
    marker_color=female_color,
    customdata=df_ttest_sorted[['en']],
    hovertemplate='Occupation: %{x}<br>English: %{customdata[0]}<br>Mean (Female): %{y:.2f}<extra></extra>'
))

fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    yaxis_title='Bias (Male - Female)',
    yaxis=dict(
        range=[-3.1, 3.1],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    template='plotly_white',
    font=dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

# Show the plot
fig.show()

# Save it as HTML and image
fig.write_html('occupations_hu_gender.html')
fig.write_image('occupations_hu_gender.png', scale=3, width=1000, height=400)

Number of male participants: 11
Number of female participants: 11


#### Biases and biases by

In [11]:
# Add a column 'bias_by' to df_ttest_sorted: 'Male' if abs(mean_male) > abs(mean_female), 'Female' if abs(mean_female) > abs(mean_male), 'Equal' if they are the same
df_ttest_sorted['bias_by'] = np.where(
    np.abs(df_ttest_sorted['mean_male']) > np.abs(df_ttest_sorted['mean_female']), 'Men',
    np.where(np.abs(df_ttest_sorted['mean_male']) < np.abs(df_ttest_sorted['mean_female']), 'Women', 'Equal')
)

# Add columns that show if the bias is male or female. Add male, if the overall mean is lower than 0, othervise add Female
df_ttest_sorted['bias'] = np.where(
    df_ttest_sorted['overall_mean'] > 0, 'Female',
    np.where(df_ttest_sorted['overall_mean'] < 0, 'Male', 'Equal')
)

# Calculate mean_difference (mean_male - mean_female), and add its absolute value
df_ttest_sorted['mean_difference'] = (df_ttest_sorted['mean_male'] - df_ttest_sorted['mean_female']).abs()


In [12]:
# # Plot
# fig = go.Figure()
# fig.add_trace(go.Bar(
#     x=df_ttest_sorted['occupation'],
#     y=df_ttest_sorted['overall_mean'],
#     marker_color=df_ttest_sorted['bias_by'].map(color_map),
#     customdata=df_ttest_sorted[['mean_difference', 'bias_by']],
#     hovertemplate='Occupation: %{x}<br>Overall Mean: %{y:.2f}<br>Mean Diff: %{customdata[0]:.2f}<br>Greater Bias: %{customdata[1]}<extra></extra>'
# ))

# # Layout
# fig.update_layout(
#     title='Mean Ratings by Dominantly Biased Group, Sorted by Mean Rating Difference',
#     xaxis_title='Occupation (No significance)',
#     yaxis_title='Overall Mean Rating',
#     xaxis_tickangle=-45,
#     template='plotly_white'
# )

# fig.show()

#### Confusion matrix of biases

In [13]:
# Count unique values in 'bias' and 'bias_by' columns, ignoring 'Equal'
# Create confusion matrix, ignoring 'Equal'
conf_matrix_hu = pd.crosstab(
    df_ttest_sorted.loc[df_ttest_sorted['bias'] != 'Equal', 'bias'],
    df_ttest_sorted.loc[df_ttest_sorted['bias_by'] != 'Equal', 'bias_by']
)

# Plot confusion matrix with plotly
fig = go.Figure(data=go.Heatmap(
    z=conf_matrix_hu.values,
    x=conf_matrix_hu.columns,
    y=conf_matrix_hu.index,
    colorscale=hungarian_color_scale,
    text=conf_matrix_hu.values,
    texttemplate="%{text}",
    hovertemplate="Bias: %{y}<br>Bias By: %{x}<br>Count: %{z}<extra></extra>"
))

fig.update_layout(
    margin=dict(l=0, r=0, t=0, b=0),
    title="Confusion Matrix of Biases",
    xaxis_title="Bias By",
    yaxis_title="Bias",
    width=500,
    height=400
)

# Show
fig.show()

# Save confusion matrix as HTML and image
# fig.write_html('confusion_matrix_hu.html')
# fig.write_image('confusion_matrix_hu.png', scale=3, width=550, height=400)

#### Transpose

In [14]:
# Transpose results, a prepare a copy with only the ratings
df_hu_ratings = df_hu[rating_columns].transpose()
df_hu = df_hu.transpose()

# Remove rows with index: ID, Start Time, Completion time, Email, Name, Participant ID
df_hu = df_hu.drop(['ID', 'Start time', 'Completion time', 'Email', 'Name', 'Participant ID'], axis=0) 

# Print number of rows
print(f"Number of rows: {len(df_hu_ratings)}")

# Sort by index
df_hu = df_hu.sort_index()

# Show
df_hu.head()

Number of rows: 44


Unnamed: 0,0,1,2,3,4,5,6,7,8,10,...,14,15,16,17,18,19,20,21,22,23
Age,25-35,45-55,25-35,35-45,25-35,25-35,35-45,45-55,25-35,45-55,...,35-45,25-35,45-55,45-55,25-35,25-35,25-35,25-35,25-35,25-35
Gender,male,male,male,male,male,male,female,female,male,female,...,female,female,female,female,female,female,female,male,male,male
HR-es,0,1,0,2,0,1,1,1,2,0,...,2,1,0,0,2,1,1,1,1,2
Mean Rating,-0.204545,-0.25,-0.204545,0.272727,-0.204545,-0.136364,-0.340909,-0.204545,-0.181818,-0.113636,...,-0.068182,-0.25,-0.090909,-0.090909,-0.25,-0.386364,-0.25,-0.272727,-0.272727,-0.409091
PR munkatárs,0,0,0,1,0,0,1,-1,1,0,...,0,0,0,0,0,0,0,2,0,-1


#### One Sample T-test

In [15]:
# One Sample T-test with standard deviation
results = []

for index, row in df_hu_ratings.iterrows():
    ratings = row.dropna().astype(float)
    t_stat, p_value = stats.ttest_1samp(ratings, popmean=0)
    mean_rating = ratings.mean()
    std_rating = ratings.std()
    results.append({
        'item': index,  # item name from index
        'mean': mean_rating,
        'std': std_rating,
        't_stat': t_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    })

df_results = pd.DataFrame(results)

# Print significant results
print(df_results[df_results['significant']][['item', 'mean', 'std', 'p_value']])

# Filter for non-significant ratings
not_significant = df_results[~df_results['significant']]

# Get the min and max of the mean ratings where not significant
mean_min = not_significant['mean'].min()
mean_max = not_significant['mean'].max()

# Print the range of mean ratings where the rating is not significant
print(f"\nRange of mean ratings where the rating is not significant: {mean_min} to {mean_max}")

               item      mean       std       p_value
0            modell  1.045455  0.898532  2.052674e-05
1            katona -1.863636  0.710161  4.551072e-11
2        kórboncnok -0.772727  0.812510  2.159344e-04
3     vezérigazgató -1.272727  0.702500  3.087969e-08
4         menedzser -0.681818  0.716231  2.137300e-04
5             nővér  2.227273  0.812510  2.018214e-11
6           szakács -0.863636  0.710161  1.160885e-05
8          könyvelő  0.500000  0.859125  1.255207e-02
9        professzor -0.909091  0.867898  7.361950e-05
10          építész -1.181818  0.852803  1.934116e-06
11            tudós -0.409091  0.590326  3.827670e-03
13        pénztáros  1.045455  0.843873  9.089139e-06
15           munkás -1.545455  0.911685  9.086047e-08
16        vízimentő -1.227273  0.922307  3.434039e-06
18          tűzoltó -2.272727  0.767297  4.674128e-12
19           mérnök -1.181818  0.795006  6.928206e-07
20          rendező -0.863636  0.710161  1.160885e-05
21         takarító  1.13636

In [16]:
# Sort by mean for better readability
df_results_sorted = df_results.sort_values(by='mean')

# Create color labels
df_results_sorted['Significance'] = df_results_sorted['significant'].map({True: 'significant', False: 'not significant'})

# Plot
fig = px.bar(
    df_results_sorted,
    x='item',
    y='mean',
    color='Significance',
    color_discrete_map={'significant': 'crimson', 'not significant': 'lightgray'},
    title='One Sample T-test of Raw Ratings',
    labels={'item': 'Item', 'mean': 'Mean Rating'},
    # On hover, Show the item, mean rating, and p-value    
    hover_data=['item', 'mean', 'p_value'],
    # Edit hovertemplate to show item, mean, and p-value
    
)

fig.update_layout(
    xaxis_tickangle=-45,
    template='plotly_white',
    margin=dict(l=0, r=0, t=40, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

# Show
fig.show()

# # Save it as html
# fig.write_html('occupations_ttest_hu.html')
# fig.write_image('occupations_ttest_hu.png', scale=3, width=1000, height=400)

#### Merge

In [17]:
# Turn index column into a column called 'hu'
df_hu.reset_index(inplace=True)
df_hu.rename(columns={'index': 'hu'}, inplace=True)

# Merge df_hu and occupations on the 'hu' column
df_hu = pd.merge(df_hu, occupations, on='hu', how='left')

# Reorder columns so the dataframe starts with 'hu' 'en', 'zh', and so on
df_hu = df_hu[['#', 'hu', 'en', 'zh'] + [col for col in df_hu.columns if col not in ['#', 'hu', 'en' , 'zh']]]

# Merge df_hu and df_results on the 'hu' column
df_hu = pd.merge(df_hu, df_results, left_on='hu', right_on='item', how='left')

# Sort all occupations by their average ratings
df_hu = df_hu.sort_values(by='mean', ascending=False)

# Drop redundant columns
df_hu.drop(columns=['#', 'item', 't_stat'], inplace=True)

# Rename the columns for clarity
df_hu.rename(columns={'mean': 'mean_hu',
                      't_stat': 't_stat_hu',
                      'std': 'std_hu',
                      'p_value': 'p_value_hu',
                      'significant': 'significant_hu'}, inplace=True)


# Save df_hu as an Excel file
# df_hu.to_excel('occupations_hu.xlsx', index=False)

# Show the final DataFrame
df_hu.head()

#Show
df_hu.tail()

Unnamed: 0,hu,en,zh,0,1,2,3,4,5,6,...,deepseek_hu,gemini_hu,chatgpt_zh,copilot_zh,deepseek_zh,gemini_zh,mean_hu,std_hu,p_value_hu,significant_hu
5,biztonsági őr,security guard,保安,-2,-2,-2,-2,-1,-2,-2,...,-2.7,-2.132,-2.25,-2.1,-2.12,-2.1,-1.954545,0.485727,1.191073e-14,True
41,tűzoltó,firefighter,消防员,-2,-3,0,-2,-2,-3,-2,...,-2.95,-2.985,-2.25,-2.525,-2.58,-2.5,-2.272727,0.767297,4.674128e-12,True
0,Age,,,25-35,45-55,25-35,35-45,25-35,25-35,35-45,...,,,,,,,,,,
1,Gender,,,male,male,male,male,male,male,female,...,,,,,,,,,,
3,Mean Rating,,,-0.204545,-0.25,-0.204545,0.272727,-0.204545,-0.136364,-0.340909,...,,,,,,,,,,


In [18]:
# Show only rating rows for teacher
df_hu[df_hu['en'] == 'student'].iloc[:, 4:-12]

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,...,15,16,17,18,19,20,21,22,23,#_zh
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [19]:
# Get the top 5 words with the smallest standard deviation from df_hu
top5_smallest_std = df_hu.nsmallest(5, 'std_hu')[['hu', 'en', 'std_hu']]
print(top5_smallest_std)

               hu              en    std_hu
9            diák         student  0.000000
5   biztonsági őr  security guard  0.485727
7            bíró           judge  0.501081
10    felszolgáló          server  0.588490
40          tudós       scientist  0.590326


### Plot

In [20]:
# Make an explicit copy of the filtered DataFrame
df_hu_plot = df_hu[~df_hu['hu'].isin(['Age', 'Gender', 'Mean Rating'])].copy()

# Assign the 'bias' column safely
df_hu_plot.loc[:, 'bias'] = df_hu_plot['mean_hu'].apply(
    lambda x: 'Female' if x > mean_max else ('Male' if x < mean_min else 'Neutral')
)

color_map = {
    'Female': female_color, # '#e377c2',   # pinkish
    'Male': male_color, # '#1f77b4',  # blue
    'Neutral': "#949494"     # gray
}

# Sort df_hu_plot by mean value before plotting
df_hu_plot_sorted = df_hu_plot.sort_values('mean_hu', ascending=False)

df_hu_plot_sorted['label_hu'] = df_hu_plot_sorted.apply(
    lambda row: f"{row['hu']} ({row['en']})" if pd.notna(row['en']) else row['hu'], axis=1
)

fig = go.Figure()

for bias in ['Female', 'Neutral', 'Male']:
    subset = df_hu_plot_sorted[df_hu_plot_sorted['bias'] == bias]
    fig.add_trace(go.Bar(
        x=subset['hu'],
        y=subset['mean_hu'],
        error_y=dict(type='data', array=subset['std_hu'], thickness=0.75),
        name=bias,
        marker_color=color_map[bias],
        hovertemplate=(
            'Hungarian: %{x}<br>'
            'Mean Rating: %{y:.2f}<br>'
            'Standard Deviation: %{error_y.array:.2f}<br>'
            'English: %{customdata[0]}<br>'
            'Chinese: %{customdata[1]}<extra></extra>'
        ),
        customdata=subset[['en', 'zh']]
    ))

fig.update_layout(
    # title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    xaxis_title='',
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.1, 3.1],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    xaxis_tickangle=-45,
    template='plotly_white',
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

fig.show()

# Save this as a html and image
fig.write_html('occupations_hu.html')
fig.write_image('occupations_hu.png', scale=3, width=1000, height=400)


In [21]:
# Plot the Hungarian data with LLM reference lines using plotly.graph_objects (go)
fig = go.Figure()

# Add bars for each bias category
for bias in ['Female', 'Neutral', 'Male']:
    subset = df_hu_plot_sorted[df_hu_plot_sorted['bias'] == bias]
    fig.add_trace(go.Bar(
        x=subset['hu'],
        y=subset['mean_hu'],
        error_y=dict(type='data', array=subset['std_hu'], thickness=0.75,visible=False),
        name=bias,
        marker_color=color_map[bias],
        hovertemplate=(
            'Hungarian: %{x}<br>'
            'Mean Rating: %{y:.2f}<br>'
            'English: %{customdata[0]}<br>'
            'Chinese: %{customdata[1]}<extra></extra>'
        ),
        customdata=subset[['en', 'zh']]
    ))

# Add Copilot reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['copilot_hu'],
    mode='lines+markers',
    name='Copilot',
    line=dict(color="#F2AD22", width=2, dash='dot'),
    marker=dict(symbol='pentagon', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>Copilot: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

# Add ChatGPT reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['chatgpt_hu'],
    mode='lines+markers',
    name='ChatGPT',
    line=dict(color='#00A67E', width=2, dash='dot'),
    marker=dict(symbol='hexagon', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>ChatGPT: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

# Add Deepseek reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['deepseek_hu'],
    mode='lines+markers',
    name='Deepseek',
    line=dict(color='#3C5DFF', width=2, dash='dot'),
    marker=dict(symbol='circle', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>Deepseek: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

# Add Gemini reference line
fig.add_trace(go.Scatter(
    x=df_hu_plot_sorted['hu'],
    y=df_hu_plot_sorted['gemini_hu'],
    mode='lines+markers',
    name='Gemini',
    line=dict(color='#9177C7', width=2, dash='dot'),
    marker=dict(symbol='star-diamond', size=8),
    hovertemplate='Hungarian: %{x}<br>English: %{customdata[0]}<br>Gemini: %{y:.2f}<extra></extra>',
    customdata=df_hu_plot_sorted[['en']]
))

fig.update_layout(
    # title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    xaxis_title='',
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.1, 3.1],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    xaxis_tickangle=-45,
    template='plotly_white',
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=20, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0.95,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

fig.show()

# Save as HTML and image
fig.write_html('occupations_hu_with_ai.html')
fig.write_image('occupations_hu_with_ai.png', scale=3, width=1000, height=400)

## Chinese data

In [22]:
# Read Chinese data from hu.xlsx
df_zh = pd.read_excel('ratings_zh.xlsx')

# Rename ID to Participant ID
df_zh.rename(columns={'ID': 'Participant ID'}, inplace=True)

# Print number of participants
print(f"Number of participants: {df_zh.shape[0]}")

# Define Chinese attention checks #######################################
zh_attention_checks = ['妈妈', '女画家', '女作家', '爸爸', '男画家', '男作家'] # '男演员'

# # Show wo failed attention any checks, find participants who did not mark all hu_attention_checks columns as 3
# failed_attention = df_zh[df_zh[zh_attention_checks].apply(lambda row: any(row != 3), axis=1)]

# # Show the failed participants (if any)
# failed = failed_attention[['Participant ID'] + zh_attention_checks] if not failed_attention.empty else print("All participants passed the attention checks.")
# failed

# Drop attention checks
df_zh.drop(columns=zh_attention_checks, inplace=True)
df_zh.head()

Number of participants: 24


Unnamed: 0,Participant ID,您的年龄是？,您的性别是？,您的学历是？,警察,秘书,教授,护士,高管,教师,...,农民,学生,园丁,飞行员,人事,消防员,科学家,检察官,救生员,建筑师
0,24,小于25岁,男,硕士,1,-1,0,-1,0,0,...,1,0,1,1,0,2,0,1,2,1
1,1,小于25岁,女,硕士,0,-1,0,-2,0,-1,...,0,0,0,0,-2,1,0,0,1,0
2,2,小于25岁,女,本科,2,-2,2,-2,2,0,...,1,1,2,2,-2,2,2,2,2,2
3,3,小于25岁,女,硕士,1,-1,1,-2,1,0,...,0,0,0,1,0,2,1,1,1,2
4,4,小于25岁,男,本科,1,-1,0,-2,1,0,...,1,0,1,1,1,2,1,1,2,0


In [23]:
# Define the mapping for ratings
rating_map = {
    3: -3,
    2: -2,
    1: -1,
    0: 0,
    -1: 1,
    -2: 2,
    -3: 3
}

# Get columns to convert (skip non-rating columns)
rating_columns = df_zh.columns[4:]  # assuming first 8 columns are not ratings

# Replace and explicitly infer objects to avoid warning
for col in rating_columns:
    df_zh[col] = df_zh[col].map(rating_map)

# Rename 您的年龄是？ to Age and 您的性别是？ to Gender
df_zh.rename(columns={'您的年龄是？': 'Age', '您的性别是？':'Gender', '您的学历是？':'Education'}, inplace=True)

# Replace férfi to male and nő to female in Gender column
df_zh['Gender'] = df_zh['Gender'].replace({'男': 'male', '女': 'female'})

# In the Age column, change 小于25岁 to <25, 25-35岁 to 25-35, 36-45岁
df_zh['Age'] = df_zh['Age'].replace({'小于25岁': '<25', '25-35岁': '25-35', '35-45岁': '35-45',})

# Count and print participants based on unique Participant ID
num_participants = df_zh['Participant ID'].nunique()
print(f'Number of participants: {num_participants}')

# Count the number of columns starting with the 8th.
num_columns = len(df_zh.columns)
print(f'Number of words: {num_columns}')

# Show
df_zh.head()

Number of participants: 24
Number of words: 48


Unnamed: 0,Participant ID,Age,Gender,Education,警察,秘书,教授,护士,高管,教师,...,农民,学生,园丁,飞行员,人事,消防员,科学家,检察官,救生员,建筑师
0,24,<25,male,硕士,-1,1,0,1,0,0,...,-1,0,-1,-1,0,-2,0,-1,-2,-1
1,1,<25,female,硕士,0,1,0,2,0,1,...,0,0,0,0,2,-1,0,0,-1,0
2,2,<25,female,本科,-2,2,-2,2,-2,0,...,-1,-1,-2,-2,2,-2,-2,-2,-2,-2
3,3,<25,female,硕士,-1,1,-1,2,-1,0,...,0,0,0,-1,0,-2,-1,-1,-1,-2
4,4,<25,male,本科,-1,1,0,2,-1,0,...,-1,0,-1,-1,-1,-2,-1,-1,-2,0


### Demographics

In [24]:
# # Print unique value counts of Age, Gender, and Education
# print("Unique values in Age:")
# print(df_zh['Age'].value_counts())
# print("\nUnique values in Gender:")
# print(df_zh['Gender'].value_counts())
# print("\nUnique values in Education:")
# print(df_zh['Education'].value_counts())

# Sawp colors
gender_colors = gender_colors[::-1]

# Prepare gender and age counts and percentages
gender_counts = df_zh['Gender'].value_counts()
gender_labels = gender_counts.index
gender_labels_en = ['Male' if g == 'male' else 'Female' for g in gender_labels]
gender = gender_counts / gender_counts.sum() * 100

age_counts = df_zh['Age'].value_counts().sort_index()
age = age_counts / age_counts.sum() * 100

# Gender pie with custom colors
gender_pie = go.Pie(
    labels=gender_labels_en,
    values=gender.values,
    name='Gender',
    hole=0.33,
    title='Gender',
    marker=dict(colors=gender_colors),
    customdata=np.stack([gender_labels_en, gender_counts.values, gender.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(gender_labels_en, gender_counts.values, gender.values)],
)

# Age pie with custom teal shades (repeat if more age groups)
age_colors = (age_shades * ((len(age) // len(age_shades)) + 1))[:len(age)]
age_pie = go.Pie(
    labels=age.index,
    values=age.values,
    name='Age',
    hole=0.33,
    title='Age',
    marker=dict(colors=age_colors),
    customdata=np.stack([age.index, age_counts.values, age.values], axis=-1),
    textinfo='text',
    textfont=dict(size=28),
    text=[f"{label}<br>n={int(n)}<br>{p:.1f}%" for label, n, p in zip(age.index, age_counts.values, age.values)],
)

# Create subplot with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=['Gender Distribution', 'Age Distribution']
)

fig.add_trace(gender_pie, 1, 1)
fig.add_trace(age_pie, 1, 2)

fig.update_layout(
    font = dict(family="Times New Roman, serif", size=32, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    showlegend=False,
    legend=dict(
        font=dict(size=28),
        orientation='v',
        x=0.5,
        xanchor='center',
        y=1,
        yanchor='top'
    ),
    annotations=[
        dict(
            text=f'No. of participants: {df_zh.shape[0]}',
            x=0.5, y=0.9, xref='paper', yref='paper',
            showarrow=False, font=dict(size=28), align='center'
        )
    ]
)

fig.show()

# Save it as html
fig.write_html('demographics_zh.html')

# Save it as image
fig.write_image('demographics_zh.png', scale=3, width=1000, height=400)

### Analysis

#### Two Sample T-Test by Gender

In [25]:
# Add a new column with the mean of the ratings for each participant
df_zh['Mean Rating'] = df_zh.iloc[:, 8:].mean(axis=1)

# Separate df_zh into male and female datasets based on the Gender column
df_zh_male = df_zh[df_zh['Gender'] == 'male']
df_zh_female = df_zh[df_zh['Gender'] == 'female']

# Show the number of male and female participants
print(f"Number of male participants: {df_zh_male['Participant ID'].nunique()}")
print(f"Number of female participants: {df_zh_female['Participant ID'].nunique()}")

# Two Sample T-Test between male and female participants for each occupation word
results = []
for col in rating_columns:
    male_ratings = df_zh_male[col].dropna().astype(float)
    female_ratings = df_zh_female[col].dropna().astype(float)
    # Only test if both groups have at least 2 ratings
    if len(male_ratings) > 1 and len(female_ratings) > 1:
        t_stat, p_value = stats.ttest_ind(male_ratings, female_ratings, equal_var=False)
        mean_male = male_ratings.mean()
        mean_female = female_ratings.mean()
        # Get English translation from occupations DataFrame
        en_translation = occupations.loc[occupations['zh'] == col, 'en'].values
        en_translation = en_translation[0] if len(en_translation) > 0 else ""
        results.append({
            'occupation': col,
            'mean_male': mean_male,
            'mean_female': mean_female,
            't_stat': t_stat,
            'p_value': p_value,
            'significant': p_value < 0.05,
            'marg_significant': 0.05 < p_value < 0.1,
            'en': en_translation
        })

# Create a DataFrame from the results
df_ttest = pd.DataFrame(results)

# Add overall mean rating for sorting
df_ttest['overall_mean'] = (df_ttest['mean_male'] + df_ttest['mean_female']) / 2

# Sort by overall mean rating
df_ttest_sorted = df_ttest.sort_values('overall_mean', ascending=False)

# Prepare axis labels: bold for significant
def bold_label(row):
    occupation = row['occupation']
    if row.get('significant'):
        occupation = f"<b>{occupation}*</b>"
    if row.get('marg_significant'):
        occupation = f"<b>{occupation}+</b>"
    return occupation

df_ttest_sorted['occupation_label'] = df_ttest_sorted.apply(bold_label, axis=1)

# Plot with English translation in hover
fig = go.Figure()

fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation_label'],
    y=df_ttest_sorted['mean_male'],
    name='Male',
    marker_color=male_color,
    customdata=df_ttest_sorted[['en']],
    hovertemplate='Occupation: %{x}<br>English: %{customdata[0]}<br>Mean (Male): %{y:.2f}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=df_ttest_sorted['occupation_label'],
    y=df_ttest_sorted['mean_female'],
    name='Female',
    marker_color=female_color,
    customdata=df_ttest_sorted[['en']],
    hovertemplate='Occupation: %{x}<br>English: %{customdata[0]}<br>Mean (Female): %{y:.2f}<extra></extra>'
))

fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    yaxis_title='Bias (Male - Female)',
    yaxis=dict(
        range=[-3.1, 3.1],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    template='plotly_white',
    font=dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

# Show the plot
fig.show()

# Save it as HTML and image
fig.write_html('occupations_zh_gender.html')
fig.write_image('occupations_zh_gender.png', scale=3, width=1000, height=400)

Number of male participants: 14
Number of female participants: 10


#### Biases and biases by

In [26]:
# Add a column 'bias_by' to df_ttest_sorted: 'Male' if abs(mean_male) > abs(mean_female), 'Female' if abs(mean_female) > abs(mean_male), 'Equal' if they are the same
df_ttest_sorted['bias_by'] = np.where(
    np.abs(df_ttest_sorted['mean_male']) > np.abs(df_ttest_sorted['mean_female']), 'Men',
    np.where(np.abs(df_ttest_sorted['mean_male']) < np.abs(df_ttest_sorted['mean_female']), 'Women', 'Equal')
)

# Add columns that show if the bias is male or female. Add male, if the overall mean is lower than 0, othervise add Female
df_ttest_sorted['bias'] = np.where(
    df_ttest_sorted['overall_mean'] > 0, 'Female',
    np.where(df_ttest_sorted['overall_mean'] < 0, 'Male', 'Equal')
)

# Calculate mean_difference (mean_male - mean_female), and add its absolute value
df_ttest_sorted['mean_difference'] = (df_ttest_sorted['mean_male'] - df_ttest_sorted['mean_female']).abs()


#### Confusion matrix of biases

In [27]:
# Count unique values in 'bias' and 'bias_by' columns, ignoring 'Equal'
# Create confusion matrix, ignoring 'Equal'
conf_matrix_zh = pd.crosstab(
    df_ttest_sorted.loc[df_ttest_sorted['bias'] != 'Equal', 'bias'],
    df_ttest_sorted.loc[df_ttest_sorted['bias_by'] != 'Equal', 'bias_by']
)

# Plot confusion matrix with plotly
fig = go.Figure(data=go.Heatmap(
    z=conf_matrix_zh.values,
    x=conf_matrix_zh.columns,
    y=conf_matrix_zh.index,
    colorscale=chinese_color_scale,
    text=conf_matrix_zh.values,
    texttemplate="%{text}",
    hovertemplate="Bias: %{y}<br>Bias By: %{x}<br>Count: %{z}<extra></extra>"
))

fig.update_layout(
    font=dict(family="Times New Roman, serif", size=28, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    title="Confusion Matrix of Biases",
    xaxis_title="Bias By",
    yaxis_title="Bias",
    width=500,
    height=400
)

# Show
fig.show()

# # Save confusion matrix as HTML and image
# fig.write_html('confusion_matrix_zh.html')
# fig.write_image('confusion_matrix_zh.png', scale=3, width=550, height=400)

#### Combined confusion matrices

In [28]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

# Create subplots
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Hungarian", "Chinese"],
    horizontal_spacing=0.40
)
fig.update_annotations(font_size=32)

# Hungarian confusion matrix heatmap (greenish)
fig.add_trace(
    go.Heatmap(
        z=conf_matrix_hu.values,
        x=conf_matrix_hu.columns,
        y=conf_matrix_hu.index,
        colorscale=hungarian_color_scale,
        text=conf_matrix_hu.values,
        texttemplate="%{text}",
        hovertemplate="Bias: %{y}<br>Bias By: %{x}<br>Count: %{z}<extra></extra>",
        # yaxis =dict(
            # title_text="Bias",
            # side="right",
            # title_standoff=10,
        # )    
    ),
    row=1, col=1
)

# Chinese confusion matrix heatmap (reddish)
fig.add_trace(
    go.Heatmap(
        z=conf_matrix_zh.values,
        x=conf_matrix_zh.columns,
        y=conf_matrix_zh.index,
        colorscale=chinese_color_scale,
        text=conf_matrix_zh.values,
        texttemplate="%{text}",
        hovertemplate="Bias: %{y}<br>Bias By: %{x}<br>Count: %{z}<extra></extra>"
    ),
    row=1, col=2
)

fig.update_traces(showscale=False)

fig.update_layout(
    font=dict(family="Times New Roman, serif", size=32, color='black'),
    margin=dict(l=0, r=0, t=40, b=0),
    width=1000,
    height=400,
    # showlegend=False
)

fig.update_xaxes(title_text="Bias By", row=1, col=1)
fig.update_xaxes(title_text="Bias By", row=1, col=2)
fig.update_yaxes(title_text="Bias", row=1, col=1, side="right")
fig.update_yaxes(title_text="Bias", row=1, col=2)

# fig.update_yaxes(
#     showticklabels=True,
#     title_text="Bias",
#     title_standoff=10,
#     side="right",
#     row=1, col=2
# )

fig.show()

# Save the combined confusion matrix as HTML and image
fig.write_html('confusion_matrices.html')
fig.write_image('confusion_matrices.png', scale=3, width=1000, height=400)

#### Transpose

In [29]:
# Transpose results, a prepare a copy with only the ratings
df_zh_ratings = df_zh[rating_columns].transpose()
df_zh = df_zh.transpose()

# Remove rows with index: ID, Start Time, Completion time, Email, Name, Participant ID
df_zh = df_zh.drop(['Participant ID', 'Education'], axis=0) 

# Print number of rows
print(f"Number of rows: {len(df_zh_ratings)}")

# # Sort by index
# df_zh = df_zh.sort_index()

# Show
df_zh.head()

Number of rows: 44


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
Age,<25,<25,<25,<25,<25,<25,<25,<25,35-45,<25,...,<25,<25,<25,25-35,<25,25-35,25-35,25-35,25-35,<25
Gender,male,female,female,female,male,female,female,male,male,male,...,male,male,male,female,male,male,male,male,male,female
警察,-1,0,-2,-1,-1,-1,0,-2,-1,-2,...,-1,-1,-1,-1,-2,-1,-1,-2,0,-1
秘书,1,1,2,1,1,0,0,2,1,0,...,1,1,1,1,1,1,1,2,1,1
教授,0,0,-2,-1,0,-1,-1,-2,0,-1,...,-1,0,0,0,0,0,0,-1,0,0


#### One Sample T-test

In [30]:
# One Sample T-test with standard deviation
results = []

for index, row in df_zh_ratings.iterrows():
    ratings = row.dropna().astype(float)
    t_stat, p_value = stats.ttest_1samp(ratings, popmean=0)
    mean_rating = ratings.mean()
    std_rating = ratings.std()
    results.append({
        'item': index,  # item name from index
        'mean': mean_rating,
        'std': std_rating,
        't_stat': t_stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    })

df_results = pd.DataFrame(results)

# Print significant results
print(df_results[df_results['significant']][['item', 'mean', 'std', 'p_value']])

# Filter for non-significant ratings
not_significant = df_results[~df_results['significant']]

# Get the min and max of the mean ratings where not significant
mean_min = not_significant['mean'].min()
mean_max = not_significant['mean'].max()

# Print the range of mean ratings where the rating is not significant
print(f"\nRange of mean ratings where the rating is not significant: {mean_min} to {mean_max}")

   item      mean       std       p_value
0    警察 -1.125000  0.612372  5.367933e-09
1    秘书  0.958333  0.550033  1.386222e-08
2    教授 -0.458333  0.658005  2.385824e-03
3    护士  1.583333  0.503610  1.312409e-13
4    高管 -0.500000  0.659380  1.138944e-03
5    教师  0.541667  0.658005  5.185043e-04
6    前台  1.375000  1.013496  8.825420e-07
7    工人 -1.125000  0.797414  4.790048e-07
8    幼师  1.583333  0.717282  1.711575e-10
9    模特  0.333333  0.564660  8.223215e-03
10   护工  0.625000  0.769670  5.938987e-04
11   保姆  1.625000  0.710939  8.681772e-11
12   会计  0.500000  0.722315  2.511711e-03
13  工程师 -0.875000  0.899879  8.393129e-05
14   保洁  0.708333  0.858673  5.077495e-04
15   法官 -0.375000  0.646899  9.277574e-03
16  导购员  0.833333  0.637022  1.537714e-06
17  美容师  1.458333  0.779028  3.813259e-09
19  乘务员  0.458333  0.658005  2.385824e-03
20  理发师 -0.750000  0.737210  4.853396e-05
21  空服员  0.708333  0.806450  2.646259e-04
22  售票员  0.333333  0.481543  2.511711e-03
23   厨师 -1.041667  0.750604  6.206

In [31]:
# Sort by mean for better readability
df_results_sorted = df_results.sort_values(by='mean')

# Create color labels
df_results_sorted['Significance'] = df_results_sorted['significant'].map({True: 'significant', False: 'not significant'})

# Plot
fig = px.bar(
    df_results_sorted,
    x='item',
    y='mean',
    color='Significance',
    color_discrete_map={'significant': 'crimson', 'not significant': 'lightgray'},
    title='One Sample T-test of Raw Ratings',
    labels={'item': 'Item', 'mean': 'Mean Rating'},
    # On hover, Show the item, mean rating, and p-value    
    hover_data=['item', 'mean', 'p_value'],
    # Edit hovertemplate to show item, mean, and p-value
    
)

fig.update_layout(
    xaxis_tickangle=-45,
    template='plotly_white',
    margin=dict(l=0, r=0, t=40, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

# Show
fig.show()

# # Save it as html
# fig.write_html('occupations_ttest_zh.html')
# fig.write_image('occupations_ttest_zh.png', scale=3, width=1000, height=400)

#### Merge

In [32]:
# Turn index column into a column called 'zh'
df_zh.reset_index(inplace=True)
df_zh.rename(columns={'index': 'zh'}, inplace=True)

# Merge df_zh and occupations on the 'zh' column
df_zh = pd.merge(df_zh, occupations, on='zh', how='left')

# Reorder columns so the dataframe starts with 'zh' 'en', 'zh', and so on
df_zh = df_zh[['#', 'zh', 'en', 'hu'] + [col for col in df_zh.columns if col not in ['#', 'zh', 'en' , 'hu']]]

# Merge df_zh and df_results on the 'hu' column
df_zh = pd.merge(df_zh, df_results, left_on='zh', right_on='item', how='left')

# Sort all occupations by their average ratings
df_zh = df_zh.sort_values(by='mean', ascending=False)

# Drop redundant columns
df_zh.drop(columns=['#', 'item', 't_stat'], inplace=True)

# Rename the columns for clarity
df_zh.rename(columns={'mean': 'mean_zh',
                      't_stat': 't_stat_zh',
                      'std': 'std_zh',
                      'p_value': 'p_value_zh',
                      'significant': 'significant_zh'}, inplace=True)

# Save df_zh as an Excel file
# df_zh.to_excel('occupations_zh.xlsx', index=False)

# Show the final DataFrame
df_zh.head()

#Show
df_zh.tail()

Unnamed: 0,zh,en,hu,0,1,2,3,4,5,6,...,deepseek_hu,gemini_hu,chatgpt_zh,copilot_zh,deepseek_zh,gemini_zh,mean_zh,std_zh,p_value_zh,significant_zh
34,保安,security guard,biztonsági őr,-2,-1,-2,-1,-2,-2,-1,...,-2.7,-2.132,-2.25,-2.1,-2.12,-2.1,-1.791667,0.72106,1.660571e-11,True
43,消防员,firefighter,tűzoltó,-2,-1,-2,-2,-2,-2,-2,...,-2.95,-2.985,-2.25,-2.525,-2.58,-2.5,-1.791667,0.72106,1.660571e-11,True
0,Age,,,<25,<25,<25,<25,<25,<25,<25,...,,,,,,,,,,
1,Gender,,,male,female,female,female,male,female,female,...,,,,,,,,,,
48,Mean Rating,,,-0.25,0.15,-0.4,-0.25,-0.35,-0.25,0.225,...,,,,,,,,,,


In [33]:
# Show only rating rows for teacher
df_zh[df_zh['en'] == 'student'].iloc[:, 4:-12]

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,21,22,23,#_zh
39,0,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [34]:
# Get the top 5 words with the smallest standard deviation from df_hu
top5_smallest_std = df_zh.nsmallest(5, 'std_zh')[['zh', 'en', 'std_zh']]
print(top5_smallest_std)

     zh             en    std_zh
39   学生        student  0.204124
22  服务员         waiter  0.408248
21  服务员         server  0.408248
26  售票员  ticket seller  0.481543
6    护士          nurse  0.503610


In [35]:
# Find duplicate values in the 'zh' column
duplicates = df_zh['zh'][df_zh['zh'].duplicated(keep=False)]

# Show duplicate rows
duplicate_rows = df_zh[df_zh['zh'].isin(duplicates)]
print("Duplicate rows in 'zh' column:")
print(duplicate_rows[['zh', 'en', 'hu']])

# # If there are duplicates, append (2) to the second and further occurrences
# if not duplicates.empty:
#     # Create a counter for each zh value
#     zh_counts = {}
#     def append_suffix(zh):
#         count = zh_counts.get(zh, 0) + 1
#         zh_counts[zh] = count
#         if count > 1:
#             return f"{zh}(2)"
#         return zh
#     df_zh['zh'] = df_zh['zh'].apply(append_suffix)

# df_zh

# If there are duplicates, remove the second one
if not duplicates.empty:
    # Create a mask for duplicates
    mask = df_zh.duplicated(subset=['zh'], keep='first')
    # Keep only the first occurrence of each duplicate
    df_zh = df_zh[~mask]
# Show the DataFrame after removing duplicates
print("DataFrame after removing duplicates:")
print(df_zh[['zh', 'en', 'hu']])

Duplicate rows in 'zh' column:
     zh      en           hu
6    护士   nurse        ápoló
5    护士   nurse        nővér
22  服务员  waiter       pincér
21  服务员  server  felszolgáló
DataFrame after removing duplicates:
             zh                    en               hu
14           保姆                 nanny            dadus
6            护士                 nurse            ápoló
11           幼师  kindergarten teacher   óvodapedagógus
20          美容师            beautician       kozmetikus
9            前台          receptionist        recepciós
3            秘书             secretary           titkár
30          收银员               cashier        pénztáros
19          导购员        shop assistant      bolti eladó
17           保洁               cleaner         takarító
25          空服员     flight attendant*       stewardess
13           护工             caregiver          gondozó
42           人事          HR personnel            HR-es
8            教师               teacher            tanár
29          家政员  

In [36]:
# Get the top 5 words with the smallest standard deviation from df_hu
top5_smallest_std = df_zh.nsmallest(5, 'std_zh')[['zh', 'en', 'std_zh']]
print(top5_smallest_std)

     zh             en    std_zh
39   学生        student  0.204124
22  服务员         waiter  0.408248
26  售票员  ticket seller  0.481543
6    护士          nurse  0.503610
31   医生         doctor  0.508977


### Plot

In [37]:
# Make an explicit copy of the filtered DataFrame
df_zh_plot = df_zh[~df_zh['zh'].isin(['Age', 'Gender', 'Mean Rating'])].copy()

# Assign the 'bias' column safely
df_zh_plot.loc[:, 'bias'] = df_zh_plot['mean_zh'].apply(
    lambda x: 'Female' if x > mean_max else ('Male' if x < mean_min else 'Neutral')
)

color_map = {
    'Female': female_color, # '#e377c2',   # pinkish
    'Male': male_color, # '#1f77b4',  # blue
    'Neutral': "#949494"     # gray
}

# Sort df_zh_plot by mean value before plotting
df_zh_plot_sorted = df_zh_plot.sort_values('mean_zh', ascending=False)

df_zh_plot_sorted['label_zh'] = df_zh_plot_sorted.apply(
    lambda row: f"{row['zh']} ({row['en']})" if pd.notna(row['en']) else row['zh'], axis=1
)

fig = go.Figure()

for bias in ['Female', 'Neutral', 'Male']:
    subset = df_zh_plot_sorted[df_zh_plot_sorted['bias'] == bias]
    fig.add_trace(go.Bar(
        x=subset['zh'],
        y=subset['mean_zh'],
        error_y=dict(type='data', array=subset['std_zh'], thickness=0.75),
        name=bias,
        marker_color=color_map[bias],
        hovertemplate=(
            'Chinese: %{x}<br>'
            'Mean Rating: %{y:.2f}<br>'
            'Standard Deviation: %{error_y.array:.2f}<br>'
            'English: %{customdata[0]}<br>'
            'Hungarian: %{customdata[1]}<extra></extra>'
        ),
        customdata=subset[['en', 'hu']]
    ))

fig.update_layout(
    # title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    xaxis_title='',
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.05, 3.05],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    xaxis_tickangle=-45,
    template='plotly_white',
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=0, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

fig.show()

# Save this as a html and image
fig.write_html('occupations_zh.html')
fig.write_image('occupations_zh.png', scale=3, width=1000, height=400)


In [38]:
# Plot the Chinese data with LLM reference lines using plotly.graph_objects (go)
fig = go.Figure()

# Add bars for each bias category
for bias in ['Female', 'Neutral', 'Male']:
    subset = df_zh_plot_sorted[df_zh_plot_sorted['bias'] == bias]
    fig.add_trace(go.Bar(
        x=subset['zh'],
        y=subset['mean_zh'],
        error_y=dict(type='data', array=subset['std_zh'], thickness=0.75,visible=False),
        name=bias,
        marker_color=color_map[bias],
        hovertemplate=(
            'Chinese: %{x}<br>'
            'Mean Rating: %{y:.2f}<br>'
            'English: %{customdata[0]}<br>'
            'Hungarian: %{customdata[1]}<extra></extra>'
        ),
        customdata=subset[['en', 'zh']]
    ))

# Add Copilot reference line
fig.add_trace(go.Scatter(
    x=df_zh_plot_sorted['zh'],
    y=df_zh_plot_sorted['copilot_zh'],
    mode='lines+markers',
    name='Copilot',
    line=dict(color="#F2AD22", width=2, dash='dot'),
    marker=dict(symbol='pentagon', size=8),
    hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>Copilot: %{y:.2f}<extra></extra>',
    customdata=df_zh_plot_sorted[['en']]
))

# Add ChatGPT reference line
fig.add_trace(go.Scatter(
    x=df_zh_plot_sorted['zh'],
    y=df_zh_plot_sorted['chatgpt_zh'],
    mode='lines+markers',
    name='ChatGPT',
    line=dict(color='#00A67E', width=2, dash='dot'),
    marker=dict(symbol='hexagon', size=8),
    hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>ChatGPT: %{y:.2f}<extra></extra>',
    customdata=df_zh_plot_sorted[['en']]
))

# Add Deepseek reference line
fig.add_trace(go.Scatter(
    x=df_zh_plot_sorted['zh'],
    y=df_zh_plot_sorted['deepseek_zh'],
    mode='lines+markers',
    name='Deepseek',
    line=dict(color='#3C5DFF', width=2, dash='dot'),
    marker=dict(symbol='circle', size=8),
    hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>Deepseek: %{y:.2f}<extra></extra>',
    customdata=df_zh_plot_sorted[['en']]
))

# Add Gemini reference line
fig.add_trace(go.Scatter(
    x=df_zh_plot_sorted['zh'],
    y=df_zh_plot_sorted['gemini_zh'],
    mode='lines+markers',
    name='Gemini',
    line=dict(color='#9177C7', width=2, dash='dot'),
    marker=dict(symbol='star-diamond', size=8),
    hovertemplate='Chinese: %{x}<br>English: %{customdata[0]}<br>Gemini: %{y:.2f}<extra></extra>',
    customdata=df_zh_plot_sorted[['en']]
))

fig.update_layout(
    # title='Mean Rating of Occupational Titles (Gender Bias Highlighted)',
    xaxis_title='',
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.1, 3.1],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Mean Rating (Bias)'
    ),
    xaxis_tickangle=-45,
    template='plotly_white',
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    margin=dict(l=0, r=0, t=20, b=0),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0.95,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
    )
)

fig.show()

# Save as HTML and image
fig.write_html('occupations_zh_with_ai.html')
fig.write_image('occupations_zh_with_ai.png', scale=3, width=1000, height=400)

## Cross-linguistic Comparison

### Check comparability

In [39]:
# Get the set of English occupation names from both dataframes
en_hu = set(df_hu['en'].dropna()) if 'en' in df_hu.columns else set()
en_zh = set(df_zh['en'].dropna()) if 'en' in df_zh.columns else set()

# Items only in Hungarian data
only_in_hu = en_hu - en_zh
# Items only in Chinese data
only_in_zh = en_zh - en_hu
# Items in both
in_both = en_hu & en_zh

print(f"Items only in Hungarian data ({len(only_in_hu)}): {sorted(only_in_hu)}\n")
print(f"Items only in Chinese data ({len(only_in_zh)}): {sorted(only_in_zh)}\n")
print(f"Items in both ({len(in_both)}): {sorted(in_both)}")

Items only in Hungarian data (2): ['PR specialist', 'server']

Items only in Chinese data (3): ['flight attendant*', 'kindergarten teacher', 'nanny']

Items in both (41): ['CEO', 'HR personnel', 'accountant', 'architect', 'beautician', 'caregiver', 'cashier', 'chef', 'cleaner', 'dietitian', 'director', 'doctor', 'engineer', 'farmer', 'firefighter', 'flight attendant', 'gardener', 'hairdresser', 'housekeeper', 'judge', 'lifeguard', 'manager', 'model', 'nurse', 'pathologist', 'pilot', 'police officer', 'professor', 'programmer', 'prosecutor', 'receptionist', 'scientist', 'secretary', 'security guard', 'shop assistant', 'soldier', 'student', 'teacher', 'ticket seller', 'waiter', 'worker']


### Unified dataframe

In [40]:
# Create a unified DataFrame with all unique occupation words (by English name)
all_en = sorted(en_hu | en_zh)

# Merge Hungarian and Chinese data on 'en' (English occupation name)
df_hu_part = df_hu[['en', 'hu', 'mean_hu', 'significant_hu']].copy()
df_zh_part = df_zh[['en', 'zh', 'mean_zh', 'significant_zh']].copy()

# Outer merge to include all occupations from both datasets
df_unified = pd.DataFrame({'en': all_en})
df_unified = df_unified.merge(df_hu_part, on='en', how='left')
df_unified = df_unified.merge(df_zh_part, on='en', how='left')

# Mark in a new column if both are significant
df_unified['both_significant'] = df_unified.apply(
    lambda row: row['significant_hu'] and row['significant_zh'] if pd.notna(row['significant_hu']) and pd.notna(row['significant_zh']) else False, axis=1)

# Drop significant_hu and significant_zh columns
df_unified.drop(columns=['significant_hu', 'significant_zh'], inplace=True)

# Add new column that shows the difference between mean_hu and mean_zh
df_unified['mean_difference'] = df_unified.apply(
    lambda row: row['mean_hu'] - row['mean_zh'] if pd.notna(row['mean_hu']) and pd.notna(row['mean_zh']) else None, axis=1)

# Sort by this
df_unified = df_unified.sort_values(by='mean_difference', ascending=False)

# Count and print the number of unique occupations in the unified DataFrame
num_unique_occupations = df_unified['en'].nunique()
print(f'Number of unique occupations in unified DataFrame: {num_unique_occupations}')

# Export this as an Excel file
df_unified.to_excel('occupations_unified.xlsx', index=False)

# Show the unified DataFrame
df_unified.head()

Number of unique occupations in unified DataFrame: 46


Unnamed: 0,en,hu,mean_hu,zh,mean_zh,both_significant,mean_difference
19,hairdresser,fodrász,1.0,理发师,-0.75,True,1.75
20,housekeeper,házvezető,1.772727,家政员,0.5,True,1.272727
16,flight attendant,légiutas-kísérő,1.454545,乘务员,0.458333,True,0.996212
5,beautician,kozmetikus,2.272727,美容师,1.458333,True,0.814394
25,model,modell,1.045455,模特,0.333333,True,0.712121


### Plot comparison

In [41]:

# # Prepare comparison DataFrame for occupations present in both datasets
# df_compare = df_hu[df_hu['en'].isin(in_both)][['en', 'mean_hu', 'hu']].merge(
#     df_zh[df_zh['en'].isin(in_both)][['en', 'mean_zh', 'zh']], on='en', suffixes=('_hu', '_zh')
# )

# # Sort by the average of the two means for better visualization
# df_compare['mean_avg'] = (df_compare['mean_hu'] + df_compare['mean_zh']) / 2
# df_compare = df_compare.sort_values('mean_avg', ascending=False)

# # Create bar plot
# fig = go.Figure()

# fig.add_trace(go.Bar(
#     x=df_compare['en'],
#     y=df_compare['mean_hu'],
#     name='Hungarian',
#     marker_color = hungarian_color,
#     hovertemplate='Hungarian: %{customdata[0]}<br>Mean: %{y:.2f}',
#     customdata=df_compare[['hu']]
# ))

# fig.add_trace(go.Bar(
#     x=df_compare['en'],
#     y=df_compare['mean_zh'],
#     name='Chinese',
#     marker_color = chinese_color,
#     hovertemplate='Chinese: %{customdata[0]}<br>Mean: %{y:.2f}',
#     customdata=df_compare[['zh']]
# ))

# fig.update_layout(
#     barmode='group',
#     # title='Comparison of Occupational Gender Bias Ratings by Language (Hungarian vs. Chinese)',
#     template='plotly_white',
#     margin=dict(l=0, r=0, t=20, b=0),
#     font = dict(family="Times New Roman, serif", size=16, color='black'),
#     xaxis_title='',
#     xaxis_tickangle=-45,
#     yaxis_title='Mean Rating',
#     yaxis=dict(
#         range=[-3.1, 3.1],
#         tickvals=[-3, -2, -1, 0, 1, 2, 3],
#         title='Mean Rating (Bias)',
#         ),
#     legend=dict(
#         orientation='h',
#         yanchor='bottom',
#         y=0,
#         xanchor='center',
#         x=0.5,
#         bgcolor='rgba(220, 220, 220, 0.25)',
#         ),
#     )

# # Show the plot
# fig.show()

# # Save as html
# fig.write_html('occupations_comparison.html')

# # Save as image
# fig.write_image('occupations_comparison.png', scale=3, width=1000, height=400)

### Test significance of the differences

In [42]:
# Delete the last 7 clumns from df_hu and df_zh
columns_to_drop_hu = ['chatgpt_hu', 'copilot_hu', 'gemini_hu', 'deepseek_hu', 'chatgpt_zh', 'copilot_zh', 'gemini_zh', 'deepseek_zh', 'mean_hu', 'p_value_hu', 'std_hu', 'significant_hu']
columns_to_drop_zh = ['chatgpt_hu', 'copilot_hu', 'gemini_hu', 'deepseek_hu', 'chatgpt_zh', 'copilot_zh', 'gemini_zh', 'deepseek_zh', 'mean_zh', 'p_value_zh', 'std_zh', 'significant_zh']
df_hu_ratings = df_hu.drop(columns=columns_to_drop_hu)
df_zh_ratings = df_zh.drop(columns=columns_to_drop_zh)

# Drop rows of Gender, Age, and Mean Rating from df_hu_ratings and df_zh_ratings
df_hu_ratings = df_hu_ratings[~df_hu_ratings['hu'].isin(['Gender', 'Age', 'Mean Rating'])]
df_zh_ratings = df_zh_ratings[~df_zh_ratings['zh'].isin(['Gender', 'Age', 'Mean Rating'])]

df_hu_ratings.head()
df_zh_ratings.head()

Unnamed: 0,zh,en,hu,0,1,2,3,4,5,6,...,15,16,17,18,19,20,21,22,23,#_zh
14,保姆,nanny,dadus,1,1,2,2,2,1,2,...,2,1,1,1,1,1,3,1,1,
6,护士,nurse,ápoló,1,2,2,2,2,1,1,...,2,2,1,1,1,1,2,2,1,
11,幼师,kindergarten teacher,óvodapedagógus,1,2,2,2,2,2,2,...,2,2,1,1,1,2,3,0,2,
20,美容师,beautician,kozmetikus,2,2,2,0,1,1,2,...,2,1,0,2,0,1,2,2,2,
9,前台,receptionist,recepciós,0,0,2,1,2,1,3,...,2,0,1,2,0,1,3,0,2,


In [43]:
common_en = sorted(
    set(df_hu['en'].dropna())
    .intersection(df_zh['en'].dropna())
)
common_en

# Print length of common_en
print(f"Number of common English occupations: {len(common_en)}")

Number of common English occupations: 41


In [44]:
# Get a list of occupation in english that are common in both Hungarian and Chinese datasets
common_en = sorted(
    set(df_hu['en'].dropna())
    .intersection(df_zh['en'].dropna())
)

results = []

for en_name in common_en:
    # Get the occupation row for each language
    hu_row = df_hu_ratings[df_hu_ratings['en'] == en_name]
    zh_row = df_zh_ratings[df_zh_ratings['en'] == en_name]
    
    # Get raw ratings (drop non-numeric columns)
    hu_ratings = hu_row.drop(columns=['en', 'hu', 'zh'], errors='ignore').values.flatten()
    zh_ratings = zh_row.drop(columns=['en', 'hu', 'zh'], errors='ignore').values.flatten()
    
    # Remove NaNs and convert to float
    hu_ratings = pd.to_numeric(pd.Series(hu_ratings).dropna(), errors='coerce')
    zh_ratings = pd.to_numeric(pd.Series(zh_ratings).dropna(), errors='coerce')
    
    # Calculate means
    mean_hu = hu_ratings.mean()
    mean_zh = zh_ratings.mean()
    diff = mean_hu - mean_zh
    
    # Perform t-test
    t_stat, p_value = ttest_ind(hu_ratings, zh_ratings, equal_var=False)
    print(f"Processing {en_name}: hu mean={mean_hu:.2f}, zh mean={mean_zh:.2f}, diff={diff:.2f}, t_stat={t_stat:.2f}, p_value={p_value:.4g}")
    
    results.append({
        'en': en_name,
        'hu': hu_row['hu'].values[0] if not hu_row.empty else None,
        'zh': zh_row['zh'].values[0] if not zh_row.empty else None,
        'mean_hu': mean_hu,
        'mean_zh': mean_zh,
        'mean_difference': diff,
        't_stat': t_stat,
        'p_value': p_value,
        'significant': p_value is not None and p_value < 0.05,
        'marginally_significant': p_value is not None and 0.05 < p_value < 0.1
    })

# Create a DataFrame from the results
df_plot = pd.DataFrame(results)

# Create axis labels: bold for significant*, bold+ for marginally significant+
def occupation_label(row):
    label = row['en']
    if row['significant']:
        label = f"<b>{label}*</b>"
    elif row['marginally_significant']:
        label = f"<b>{label}+</b>"
    return label

df_plot['en_label'] = df_plot.apply(occupation_label, axis=1)

# Sort by the mean average
df_plot['mean_avg'] = (df_plot['mean_hu'] + df_plot['mean_zh']) / 2
df_plot = df_plot.sort_values('mean_avg', ascending=False)

fig = go.Figure()

fig.add_trace(go.Bar(
    x = df_plot['en_label'],
    y = df_plot['mean_hu'],
    name = 'Hungarian',
    marker_color = hungarian_color,
    hovertemplate = 'Hungarian: %{customdata[0]}<br>Mean: %{y:.2f}<br>p-value: %{customdata[1]:.4g}',
    customdata=df_plot[['hu', 'p_value']]
))

fig.add_trace(go.Bar(
    x = df_plot['en_label'],
    y = df_plot['mean_zh'],
    name = 'Chinese',
    marker_color = chinese_color,
    hovertemplate = 'Chinese: %{customdata[0]}<br>Mean: %{y:.2f}<br>p-value: %{customdata[1]:.4g}',
    customdata = df_plot[['zh', 'p_value']]
))

fig.update_layout(
    barmode='group',
    # title='Comparison of Occupational Gender Bias Ratings by Language (Hungarian vs. Chinese)',
    template='plotly_white',
    margin=dict(l=0, r=0, t=20, b=0),
    font = dict(family="Times New Roman, serif", size=16, color='black'),
    xaxis_title='',
    xaxis_tickangle=-45,
    yaxis_title='Mean Rating (Bias)',
    yaxis=dict(
        range=[-3.05, 3.05],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        ),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0,
        xanchor='center',
        x=0.5,
        bgcolor='rgba(220, 220, 220, 0.25)',
        ),
    )

# Show the plot
fig.show()

# Save as html and image
fig.write_html('occupations_comparison.html')
fig.write_image('occupations_comparison.png', scale=3, width=1000, height=400)

Processing CEO: hu mean=-1.27, zh mean=-0.88, diff=-0.40, t_stat=-1.87, p_value=0.06835
Processing HR personnel: hu mean=0.95, zh mean=0.58, diff=0.37, t_stat=1.51, p_value=0.1379
Processing accountant: hu mean=0.50, zh mean=0.50, diff=0.00, t_stat=0.00, p_value=1
Processing architect: hu mean=-1.18, zh mean=-0.71, diff=-0.47, t_stat=-1.93, p_value=0.06014
Processing beautician: hu mean=2.27, zh mean=1.46, diff=0.81, t_stat=3.91, p_value=0.0003203
Processing caregiver: hu mean=1.00, zh mean=0.62, diff=0.38, t_stat=1.35, p_value=0.1836
Processing cashier: hu mean=1.05, zh mean=0.88, diff=0.17, t_stat=0.75, p_value=0.4575
Processing chef: hu mean=-0.86, zh mean=-1.04, diff=0.18, t_stat=0.83, p_value=0.413
Processing cleaner: hu mean=1.14, zh mean=0.71, diff=0.43, t_stat=1.44, p_value=0.1576
Processing dietitian: hu mean=0.68, zh mean=0.21, diff=0.47, t_stat=2.59, p_value=0.01307
Processing director: hu mean=-0.86, zh mean=-0.71, diff=-0.16, t_stat=-0.75, p_value=0.4568
Processing doctor:

In [45]:
# Prepare comparison DataFrame for occupations present in both datasets
df_compare = df_hu[df_hu['en'].isin(in_both)][['en', 'mean_hu', 'hu']].merge(
    df_zh[df_zh['en'].isin(in_both)][['en', 'mean_zh', 'zh']], on='en', suffixes=('_hu', '_zh'))

# Sort by the average of the two means for better visualization
df_compare['mean_avg'] = (df_compare['mean_hu'] + df_compare['mean_zh']) / 2
df_compare = df_compare.sort_values('mean_avg', ascending=False)

# Add a new column 'bias' considering absolute values: 
# 'Hungarian' if abs(mean_hu) > abs(mean_zh), 'Chinese' if abs(mean_hu) < abs(mean_zh), 'Equal' if they are the same
df_compare['bias'] = df_compare.apply(
    lambda row: 'Hungarian' if abs(row['mean_hu']) > abs(row['mean_zh']) 
    else ('Chinese' if abs(row['mean_hu']) < abs(row['mean_zh']) else 'Equal'), axis=1
)

# Count values in 'bias' column and print the results
bias_counts = df_compare['bias'].value_counts()
print("Bias counts:")
print(bias_counts)

# Plot this
fig = px.bar(
    bias_counts,
    x=bias_counts.index,
    y=bias_counts.values,
    title='Count of Bias by Occupation',
    labels={'x': 'Bias', 'y': 'Count'},
    color=bias_counts.index,
    color_discrete_map={'Hungarian': hungarian_color, 'Chinese': chinese_color, 'Equal': neutral_color}
)
fig.update_layout(
    xaxis_title='Bias',
    yaxis_title='Count',
    template='plotly_white'
)
fig.show()

Bias counts:
bias
Hungarian    30
Chinese      11
Equal         1
Name: count, dtype: int64
