In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import json
import numpy as np

In [11]:
# Read the parquet dataset and keep only the rows
# whose salary is strictly below 250,000.
df = (
    pd.read_parquet('../data/SO_2014_2022.pq')
    .loc[lambda frame: frame['Salary'] < 250000]
)

df.head()

Unnamed: 0,Year,Salary,JobSat,YearsCode,YearsCodePro,Age,Education,OrgSize,LastNewJob,Employment,RespondentType,JobSeek,Gender,Student,Country,CodingActivities,DevType,LearnCodeFrom,LangPresent
0,2022,69318.0,,10,5.0,25-34,master,500 to 999 employees,,fulltime,dev,,male,no,Germany,School or academic work,Data scientist or machine learning specialist;...,"Books / Physical media;School (i.e., Universit...",C;C++;Java;JavaScript;MATLAB;Python;Scala;SQL;...
6,2022,27652.0,,18,10.0,25-34,bachelor,"1,000 to 4,999 employees",,fulltime,dev,,male,no,Colombia,Hobby,"Developer, full-stack;Developer, back-end",Books / Physical media;Other online resources ...,Bash/Shell/PowerShell;Elixir;HTML/CSS;JavaScri...
9,2022,15431.0,,5,5.0,25-34,bachelor,20 to 99 employees,,fulltime,dev,,male,no,Ghana,Freelance/contract work,"Developer, back-end",On the job training;Coding Bootcamp,JavaScript;Ruby
13,2022,47352.0,,7,7.0,45-54,master,10 to 19 employees,,fulltime,non-dev,,male,no,Belgium,Hobby,"Developer, back-end;Educator or academic;Datab...",Books / Physical media;On the job training;Col...,Delphi;SQL
22,2022,78084.0,,25,25.0,45-54,bachelor,500 to 999 employees,,fulltime,non-dev,,male,no,Canada,Hobby;Contribute to open-source projects,"Engineer, site reliability;Security professional",Books / Physical media;Other online resources ...,Bash/Shell/PowerShell;C;JavaScript;Perl;PHP;Py...


In [16]:
def get_wage_gap_by_age(age_range: str, data: 'pd.DataFrame | None' = None) -> pd.DataFrame:
    """Compute the per-country wage gap between men and women for one age range.

    Args:
        age_range: Value of the 'Age' column to select (e.g. '25-34').
        data: Frame providing the 'Gender', 'Age', 'Country' and 'Salary'
            columns. When omitted, the notebook-level ``df`` is used, so
            existing one-argument calls keep working.

    Returns:
        A frame with one row per country and the columns 'Country',
        'SalaryMen', 'SalaryWomen' (mean salaries) and 'WageGap' (the
        absolute difference between the two means). Because the two
        aggregates are inner-joined on 'Country', only countries present
        in both the male and the female aggregate appear in the result.
    """
    # Fall back to the notebook-level frame for backward compatibility.
    source = df if data is None else data
    in_age_range = source['Age'] == age_range

    def _mean_salary_by_country(gender: str, column: str) -> pd.DataFrame:
        # Mean salary per country for one gender, stored under `column`.
        return (
            source[(source['Gender'] == gender) & in_age_range]
            .groupby(['Country'], as_index=False)
            .agg({'Salary': 'mean'})
            .rename(columns={'Salary': column})
        )

    salary_men_df = _mean_salary_by_country('male', 'SalaryMen')
    salary_women_df = _mean_salary_by_country('female', 'SalaryWomen')

    # Inner join: keep only countries with a mean for both genders.
    new_df = pd.merge(salary_men_df, salary_women_df, on='Country')
    # The gap is unsigned: it does not record which gender earns more.
    new_df['WageGap'] = (new_df['SalaryMen'] - new_df['SalaryWomen']).abs()

    return new_df


def make_title(age_range: str) -> str:
    """Build the figure title for the given age range."""
    headline = 'Global wage gap distribution in dollars between men and women'
    return f'{headline} ({age_range} years old)'

In [17]:
# Gather all age ranges present in the data, in sorted order.
# NaNs are dropped on the Series itself (not on the result of `.unique()`),
# so this works whether 'Age' is stored as a categorical or as plain strings.
age_bins = df['Age'].dropna().drop_duplicates().sort_values().tolist()

# Compute the wage gap frame once per age range, so the initial map and
# every slider step are built from exactly the same data.
wage_gaps_by_age = {
    age_range: get_wage_gap_by_age(age_range) for age_range in age_bins
}

# Share one colour scale across all age ranges. The slider does not rescale
# the colour axis, so basing it on the first age range alone would saturate
# any larger gap found in another range.
max_wage_gap = pd.concat(list(wage_gaps_by_age.values()))['WageGap'].max()

# The first age range is the slider's initial value.
wage_gap_df = wage_gaps_by_age[age_bins[0]]

# Plot the initial map.
map_fig = px.choropleth(
    wage_gap_df,
    locations='Country',
    locationmode='country names',
    color_continuous_scale='OrRd',
    range_color=[0, max_wage_gap],
    color='WageGap',
    hover_data={"WageGap": ":$d"},
    title=make_title(age_bins[0]),
)

# Some additional map config.
map_fig.update_geos(showcountries=True, showcoastlines=False)
map_fig.update_layout(geo={'showocean': True, 'oceancolor': '#a8d5f2'})

# Create the slider. Each step restyles BOTH `locations` and `z`: the set of
# countries can differ between age ranges, and updating `z` alone would map
# the new values onto the first age range's countries.
steps = [
    dict(
        method='update',
        args=[
            {
                'locations': [gap_df['Country'].tolist()],
                'z': [gap_df['WageGap'].tolist()],
            },
            # `title.text` is the explicit attribute path for the title string.
            {'title.text': make_title(age_range)},
        ],
        label=age_range,
    )
    for age_range, gap_df in wage_gaps_by_age.items()
]

map_fig.update_layout(
    sliders=[dict(
        active=0,
        currentvalue={'prefix': 'Age: '},
        steps=steps
    )],
    margin={'t': 40, 'r': 0, 'b': 0, 'l': 10}
)

map_fig.show()