In [2]:
# Execute the sibling notebook set_theme.ipynb in this kernel before anything else.
# NOTE(review): presumably it configures the shared plotting theme — confirm against that notebook.
%run set_theme.ipynb

In [3]:
# Third-party dependencies for loading the survey data and plotting it.
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode
# Initialise plotly's notebook rendering once, before any figure is shown.
# NOTE(review): per plotly's docs, connected=True loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm readers are expected to be online.
init_notebook_mode(connected=True)

In [4]:
# Read the survey extract and keep only rows whose salary is strictly
# greater than 0 and strictly less than 250000.
df = (
    pd.read_parquet('../data/SO_2014_2022.pq')
    .loc[lambda frame: frame['Salary'].gt(0) & frame['Salary'].lt(250000)]
)

# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,Year,Salary,JobSat,YearsCode,YearsCodePro,Age,Education,OrgSize,LastNewJob,Employment,RespondentType,JobSeek,Gender,Student,Country,CodingActivities,DevType,LearnCodeFrom,LangPresent
0,2022,69318.0,,10,5,25-34,master,500 to 999 employees,,fulltime,dev,,male,no,Germany,School or academic work,Data scientist or machine learning specialist;...,"Books / Physical media;School (i.e., Universit...",C;C++;Java;JavaScript;MATLAB;Python;Scala;SQL;...
6,2022,27652.0,,18,10,25-34,bachelor,"1,000 to 4,999 employees",,fulltime,dev,,male,no,Colombia,Hobby,"Developer, full-stack;Developer, back-end",Books / Physical media;Other online resources ...,Bash/Shell/PowerShell;Elixir;HTML/CSS;JavaScri...
9,2022,15431.0,,5,5,25-34,bachelor,20 to 99 employees,,fulltime,dev,,male,no,Ghana,Freelance/contract work,"Developer, back-end",On the job training;Coding Bootcamp,JavaScript;Ruby
13,2022,47352.0,,7,7,45-54,master,10 to 19 employees,,fulltime,non-dev,,male,no,Belgium,Hobby,"Developer, back-end;Educator or academic;Datab...",Books / Physical media;On the job training;Col...,Delphi;SQL
22,2022,78084.0,,25,25,45-54,bachelor,500 to 999 employees,,fulltime,non-dev,,male,no,Canada,Hobby;Contribute to open-source projects,"Engineer, site reliability;Security professional",Books / Physical media;Other online resources ...,Bash/Shell/PowerShell;C;JavaScript;Perl;PHP;Py...


In [6]:
# Compute the mean salary for each age bin, separately for male and female
# respondents, and plot those means as a grouped horizontal bar chart.


def _mean_salary_by_age(survey_df, gender):
    """Return the mean salary per age bin for a single gender.

    Parameters
    ----------
    survey_df : pd.DataFrame
        Frame containing at least the 'Gender', 'Age' and 'Salary' columns.
    gender : str
        Value of the 'Gender' column to keep (e.g. 'male').

    Returns
    -------
    pd.DataFrame
        One row per (Gender, Age) group with the columns 'Gender', 'Age'
        and 'Salary', where 'Salary' holds the group mean.
    """
    return (
        survey_df[survey_df['Gender'] == gender]
        .groupby(['Gender', 'Age'])
        .agg({'Salary': 'mean'})
        .reset_index()
    )


def _style_hover(trace):
    """Set a hover box headed by the capitalised trace name, with white text and border."""
    trace.update(
        # Plotly's text markup uses <br> for line breaks; the closing-style
        # </br> is not one of its documented tags.
        hovertemplate=(
            '<b>' + trace.name.capitalize() + '</b>'
            '<br>Average salary: %{x:d}'
            '<br>Age: %{y}<extra></extra>'
        ),
        hoverlabel={'font_color': 'white', 'bordercolor': 'white'},
    )


# Keep the per-gender frames under their own names, then stack male rows
# before female rows into a single frame for plotting.
male_age_salary_df = _mean_salary_by_age(df, 'male')
female_age_salary_df = _mean_salary_by_age(df, 'female')

age_salary_df = pd.concat([male_age_salary_df, female_age_salary_df])

fig = px.bar(
    age_salary_df,
    y='Age',
    x='Salary',
    title='Mean Salary<br><sup>salary increases as people are getting older</sup>',
    hover_data={'Salary': ':d', 'Age': False},
    orientation='h',
    barmode='group',
    color='Gender',
    color_discrete_map={
        'male': '#645cff',
        'female': '#f25fe9'
    },
    width=790,
)

# The hover text depends on each trace's name, so it is applied trace by trace.
fig.for_each_trace(_style_hover)

# Caption anchored to the bottom-right corner of the plotting area
# (paper coordinates: x=1 is the right edge, y=0.025 just above the bottom).
fig.add_annotation(x=1, y=0.025,
                   xref="paper", yref="paper",
                   align='right',
                   showarrow=False,
                   xanchor='right', yanchor='bottom',
                   text='mean salary per age segment for each of the two genders')

fig.show()