In [310]:
# Import libraries
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [311]:
# Read files
occupations = pd.read_excel('occupations.xlsx')
occupations


Unnamed: 0,#,zh,hu,en,att
0,1,模特,modell,model,
1,2,军人,katona,soldier,
2,3,法医,kórboncnok,pathologist,
3,4,董事长,vezérigazgató,CEO,
4,5,高管,menedzser,manager,
5,6,护士,nővér,nurse,
6,7,厨师,szakács,chef,
7,8,服务员,pincérnő,waitress,yes
8,9,服务员,felszolgáló,waiter,
9,10,会计,könyvelő,accountant,


In [312]:
# Read in Hungarian ratings
df_hu = pd.read_excel('hu.xlsx')

In [313]:
# Define the mapping for ratings
rating_map = {
    'Teljesen férfi': -3,
    'Nagyrészt férfi': -2,
    'Inkább férfi': -1,
    'Semleges/egyenlő': 0,
    'Inkább női': 1,
    'Nagyrészt női': 2,
    'Teljesen női': 3
}

# Get columns to convert (skip non-rating columns)
rating_columns = df_hu.columns[8:]  # assuming first 8 columns are not ratings

# Replace and explicitly infer objects to avoid warning
df_hu[rating_columns] = df_hu[rating_columns].replace(rating_map).infer_objects(copy=False)
df_hu


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



Unnamed: 0,ID,Start time,Completion time,Email,Name,Prolific ID,Életkor,Nem,modell,katona,...,tanár,rendőr,pilóta,házvezetőnő,recepciós,biztonsági őr,ügyész,kozmetikus,programozó,diák
0,1,2025-07-11 12:06:52,2025-07-11 12:07:43,anonymous,,5ef60257cd680928de23ccae,25-35,férfi,2,-3,...,0,-2,-1,3,0,-2,0,2,-1,0
1,2,2025-07-11 12:06:58,2025-07-11 12:13:33,anonymous,,5c48be0496d59b000183e68d,45-55,férfi,2,-2,...,0,-2,-2,3,1,-2,-1,3,-2,0
2,3,2025-07-11 12:14:03,2025-07-11 12:16:52,anonymous,,5a913d2cf0536100017196d8,25-35,férfi,0,-2,...,0,-1,0,3,0,-2,0,2,0,0
3,4,2025-07-11 12:12:24,2025-07-11 12:21:19,anonymous,,5d3449524e8363001735fc41,35-45,férfi,2,-2,...,2,-2,-2,3,2,-2,-1,3,-2,0
4,5,2025-07-11 12:16:52,2025-07-11 12:21:35,anonymous,,5d3873197860c8001a106e02,25-35,férfi,0,-2,...,0,-1,-1,3,0,-1,0,2,-2,0
5,6,2025-07-11 12:40:21,2025-07-11 12:44:07,anonymous,,5af82cf9e066a40001136ace,25-35,férfi,2,-2,...,0,-2,-2,3,0,-2,0,2,-2,0
6,7,2025-07-11 12:43:44,2025-07-11 12:47:23,anonymous,,6167d7e71526370c959d164b,35-45,nő,2,-2,...,-1,-1,-1,3,1,-2,-1,2,-1,0
7,8,2025-07-11 12:41:19,2025-07-11 12:47:45,anonymous,,60245c7c3df42d131232cedb,45-55,nő,1,-2,...,2,-2,-2,3,1,-2,-2,3,-2,0
8,9,2025-07-11 12:44:16,2025-07-11 12:48:37,anonymous,,5df2235d5a34251266ea645e,25-35,férfi,1,-2,...,-3,-2,-2,3,2,-2,-2,2,-2,0
9,10,2025-07-11 12:54:41,2025-07-11 12:56:58,anonymous,,599494e7bf8bcf0001ab6973,35-45,férfi,2,-2,...,0,-1,-2,3,2,-2,-1,2,-1,0


In [314]:
# Show me the ratio of "Nem" (gender) in this survey.
gender = df_hu['Nem'].value_counts(normalize=True) * 100

# Show me the ratios of "Életkor" (age) in this survey.
age = df_hu['Életkor'].value_counts(normalize=True) * 100

# Prepare data for gender pie chart
gender_pie = go.Pie(
    labels=gender.index,
    values=gender.values,
    name='Gender',
    hole=0.4,
    title='Gender'
)

# Prepare data for age pie chart
age_pie = go.Pie(
    labels=age.index,
    values=age.values,
    name='Age',
    hole=0.4,
    title='Age'
)

# Create subplot with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]],
                    subplot_titles=['Gender Distribution', 'Age Distribution'])

fig.add_trace(gender_pie, 1, 1)
fig.add_trace(age_pie, 1, 2)

fig.update_traces(textinfo='percent+label')
fig.update_layout(title_text='Gender and Age Distribution')
fig.show()

In [315]:
# Transpose results
df_hu = df_hu[rating_columns].transpose()

# Show the number of rows
print(f"Number of rows in transposed DataFrame: {df_hu.shape[0]}")

# In a new column of the transposed df, calculate the average rating for each occupation
df_hu['hu_rating'] = df_hu.mean(axis=1)

# Make the index into a column called hu
df_hu = df_hu.reset_index().rename(columns={'index': 'hu'})

# Show
df_hu.head()

Number of rows in transposed DataFrame: 50


Unnamed: 0,hu,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,hu_rating
0,modell,2,2,0,2,0,2,2,1,1,...,0,1,2,0,0,1,2,0,0,1.052632
1,katona,-3,-2,-2,-2,-2,-2,-2,-2,-2,...,-1,-2,-1,0,-1,-2,-2,-1,-2,-1.736842
2,kórboncnok,0,-1,0,-1,-2,-1,-1,-2,-2,...,0,-1,-2,0,-1,0,0,0,0,-0.736842
3,vezérigazgató,-1,-1,-1,-1,0,-1,-1,-2,-2,...,-1,-1,0,0,-2,-2,-2,0,-2,-1.157895
4,menedzser,-1,0,-1,0,0,0,-1,-1,-2,...,0,-1,0,0,-1,0,-2,0,-1,-0.578947


In [316]:
# Merge df_hu and occupations on the 'hu' column
df_hu = pd.merge(df_hu, occupations, on='hu', how='left')
df_hu.head()

# Reorder columns so the dataframe starts with 'hu' 'en', 'zh', and so on
df_hu = df_hu[['#', 'hu', 'en', 'zh'] + [col for col in df_hu.columns if col not in ['#', 'hu', 'en' , 'zh']]]

# Sort all occupations by their average ratings
df_hu = df_hu.sort_values(by='hu_rating', ascending=False)
df_hu

Unnamed: 0,#,hu,en,zh,0,1,2,3,4,5,...,11,12,13,14,15,16,17,18,hu_rating,att
33,34,ápolónő,nurse (female),护士,3,3,3,3,3,3,...,3,2,3,3,3,3,3,3,2.947368,yes
7,8,pincérnő,waitress,服务员,3,3,3,3,3,3,...,3,1,3,3,3,3,3,3,2.894737,yes
22,23,tanárnő,teacher (female),教师,3,3,3,3,3,3,...,3,1,3,3,3,3,3,3,2.894737,yes
43,44,házvezetőnő,housekeeper (female),家政员,3,3,3,3,3,3,...,3,1,3,3,3,3,3,3,2.894737,yes
29,30,takarítónő,cleaning lady,保洁,3,3,3,3,3,3,...,3,1,3,2,3,3,3,3,2.842105,yes
18,19,titkárnő,secretary (female),秘书,3,3,2,3,3,3,...,3,2,3,3,3,3,3,3,2.578947,yes
47,48,kozmetikus,beautician,美容师,2,3,2,3,2,2,...,2,2,1,2,2,3,2,3,2.210526,
5,6,nővér,nurse,护士,3,3,2,3,3,3,...,2,1,1,2,3,3,1,1,2.210526,
26,27,házvezető,housekeeper,家政员,1,2,0,3,2,2,...,2,0,2,2,2,3,2,3,1.789474,
27,28,légiutas-kísérő,flight attendant,乘务员,1,2,1,3,0,2,...,2,2,0,1,0,2,1,1,1.368421,


In [317]:
# Remove attention checks by removing rows from df_hu where 'att' columns=yes
df_hu = df_hu[~df_hu['att'].str.contains('yes', na=False)]
df_hu

Unnamed: 0,#,hu,en,zh,0,1,2,3,4,5,...,11,12,13,14,15,16,17,18,hu_rating,att
47,48,kozmetikus,beautician,美容师,2,3,2,3,2,2,...,2,2,1,2,2,3,2,3,2.210526,
5,6,nővér,nurse,护士,3,3,2,3,3,3,...,2,1,1,2,3,3,1,1,2.210526,
26,27,házvezető,housekeeper,家政员,1,2,0,3,2,2,...,2,0,2,2,2,3,2,3,1.789474,
27,28,légiutas-kísérő,flight attendant,乘务员,1,2,1,3,0,2,...,2,2,0,1,0,2,1,1,1.368421,
24,25,takarító,cleaner,保洁,1,2,0,3,0,1,...,2,1,0,2,2,2,0,2,1.157895,
34,35,gondozó,caretaker,护工,0,2,0,3,0,2,...,2,1,0,2,1,0,0,2,1.105263,
14,15,pénztáros,cashier,收银员,1,2,0,2,0,2,...,1,1,0,1,2,0,0,1,1.052632,
0,1,modell,model,模特,2,2,0,2,0,2,...,1,2,0,0,1,2,0,0,1.052632,
35,36,bolti eladó,"shop assistant, seller",导购员,1,2,0,2,0,1,...,0,1,0,1,2,0,0,2,1.052632,
25,26,HR-es,HR specialist,人力資源,0,1,0,2,0,1,...,2,2,0,2,1,0,0,2,1.0,


In [318]:
import plotly.express as px

fig = px.bar(
    df_hu,
    x='hu',
    y='hu_rating',
    title='Average Rating by Occupation',
    labels={'hu': 'Occupation', 'rating': 'Average Rating'},
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [319]:
# Add a color column based on rating sign: feminine (rating > 0), masculine (rating < 0), neutral (rating == 0)
df_hu['bias'] = df_hu['hu_rating'].apply(
    lambda x: 'Feminine' if x > 1 else ('Masculine' if x < -1 else 'Neutral')
)

color_map = {
    'Feminine': '#e377c2',   # pinkish
    'Masculine': '#1f77b4',  # blue
    'Neutral': '#7f7f7f'     # gray
}

fig = px.bar(
    df_hu,
    x='hu',
    y='hu_rating',
    color='bias',
    color_discrete_map = color_map,
    title='Average Rating by Occupation (Gender Bias Highlighted)',
    labels={'hu': 'Hungarian', 'en': 'English', 'bias': 'Bias', 'hu_rating': 'Rating',},
    hover_data=['hu', 'en', 'hu_rating']
)
fig.update_layout(
    xaxis_tickangle=-45,
    yaxis=dict(
        range=[-3, 3],
        tickvals=[-3, -2, -1, 0, 1, 2, 3],
        title='Average Rating'
    )
)
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [320]:
# Save this as a html file
fig.write_html('hu_occupations.html')
