In [4]:
import pandas as pd
import plotly.express as px

In [135]:
# Read the survey extract, keep only the rows whose salary lies strictly
# between 0 and 250,000 (rows with a missing salary fail both comparisons
# and are dropped too), and store the survey year as an integer.
df = (
    pd.read_parquet('../data/SO_2014_2022.pq')
    .loc[lambda frame: frame['Salary'].gt(0) & frame['Salary'].lt(250000)]
    .astype({'Year': int})
)

# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,Year,Salary,JobSat,YearsCode,YearsCodePro,Age,Education,OrgSize,LastNewJob,Employment,RespondentType,JobSeek,Gender,Student,Country,CodingActivities,DevType,LearnCodeFrom,LangPresent
0,2022,69318.0,,10,5.0,25-34,master,500 to 999 employees,,fulltime,dev,,male,no,Germany,School or academic work,Data scientist or machine learning specialist;...,"Books / Physical media;School (i.e., Universit...",C;C++;Java;JavaScript;MATLAB;Python;Scala;SQL;...
6,2022,27652.0,,18,10.0,25-34,bachelor,"1,000 to 4,999 employees",,fulltime,dev,,male,no,Colombia,Hobby,"Developer, full-stack;Developer, back-end",Books / Physical media;Other online resources ...,Bash/Shell/PowerShell;Elixir;HTML/CSS;JavaScri...
9,2022,15431.0,,5,5.0,25-34,bachelor,20 to 99 employees,,fulltime,dev,,male,no,Ghana,Freelance/contract work,"Developer, back-end",On the job training;Coding Bootcamp,JavaScript;Ruby
13,2022,47352.0,,7,7.0,45-54,master,10 to 19 employees,,fulltime,non-dev,,male,no,Belgium,Hobby,"Developer, back-end;Educator or academic;Datab...",Books / Physical media;On the job training;Col...,Delphi;SQL
22,2022,78084.0,,25,25.0,45-54,bachelor,500 to 999 employees,,fulltime,non-dev,,male,no,Canada,Hobby;Contribute to open-source projects,"Engineer, site reliability;Security professional",Books / Physical media;Other online resources ...,Bash/Shell/PowerShell;C;JavaScript;Perl;PHP;Py...


In [136]:
# Define how many countries to show in the plot.
TOP_N_COUNTRIES = 10


def _mean_salary_per_year_and_country(survey_df, gender, salary_column):
    """Return the mean salary per (Year, Country) for respondents of one gender.

    Parameters
    ----------
    survey_df : pd.DataFrame
        Frame with at least the columns 'Gender', 'Year', 'Country' and 'Salary'.
    gender : str
        Value of the 'Gender' column to keep (e.g. 'male').
    salary_column : str
        Name given to the mean-salary column in the result.

    Returns
    -------
    pd.DataFrame
        Columns 'Year', 'Country' and `salary_column`, one row per group.
    """
    return (
        survey_df[survey_df['Gender'] == gender]
        .groupby(['Year', 'Country'])
        .agg({'Salary': 'mean'})
        .reset_index()
        .rename(columns={'Salary': salary_column})
    )


# Get the top-n countries with the highest mean salary gap.
# =========================================================

# Calculate the salary gap per country and year.
salary_men_df = _mean_salary_per_year_and_country(df, 'male', 'SalaryMen')
salary_women_df = _mean_salary_per_year_and_country(df, 'female', 'SalaryWomen')

# The (default) inner merge keeps only the year/country pairs that have both a
# male and a female mean; dropna() removes any pair whose mean is missing.
# The gap is unsigned: it does not tell which gender earns more.
salary_gap_df = (
    pd.merge(salary_men_df, salary_women_df, on=['Year', 'Country'])
    .dropna()
    .assign(SalaryGap=lambda gaps: (gaps['SalaryMen'] - gaps['SalaryWomen']).abs())
)

# Filter on the countries that have salary gap data in every year.
countries = salary_gap_df.groupby(['Year'])['Country'].unique()
country_sets = [set(year_countries) for year_countries in countries]
# set.intersection() needs at least one set, so fall back to an empty set when
# there is no salary gap data at all instead of failing.
yearly_occuring_countries = set.intersection(*country_sets) if country_sets else set()

yearly_occuring_countries_df = salary_gap_df[salary_gap_df['Country'].isin(yearly_occuring_countries)]

# Find the countries that have the highest salary gap mean spanning all years.
highest_salary_gap_countries = (
    yearly_occuring_countries_df.groupby(['Country'])
    .agg({'SalaryGap': 'mean'})
    .reset_index()
    .sort_values('SalaryGap', ascending=False)
    ['Country']
    .tolist()
)

# Now we can finally get the top-n country names
# having the highest salary gap mean spanning all years.
top_n_countries = highest_salary_gap_countries[:TOP_N_COUNTRIES]

# Now that we know which countries have the highest
# salary gaps, let's get their salary gap data.
top_n_countries_salary_gap_df = salary_gap_df[salary_gap_df['Country'].isin(top_n_countries)].copy()

# Order the traces based on the top_n_countries + year.
# In this way, the legend will show the countries ordered from
# highest to lowest salary gap mean.
top_n_countries_salary_gap_df['Country'] = pd.Categorical(
    top_n_countries_salary_gap_df['Country'],
    categories=top_n_countries,
    ordered=True
)
top_n_countries_salary_gap_df = top_n_countries_salary_gap_df.sort_values(['Country', 'Year'])

# Finally, plot everything.
fig = px.line(
    top_n_countries_salary_gap_df,
    x='Year',
    y='SalaryGap',
    color='Country',
    # SalaryGap is a float mean, so format it as fixed-point with no decimals
    # (plus a currency symbol and thousands separators) rather than with the
    # integer-oriented 'd' type.
    hover_data={'SalaryGap': ':$,.0f'},
    title='Countries with the highest salary gaps from 2014 to 2022',
)
fig.show()

# - Show numbers in legend for clarity?
# - Maybe take n women and n men? The difference might be big because there are many more men than women.