In [1]:
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd

# from plotly.offline import init_notebook_mode
# init_notebook_mode(connected = True)

In [5]:
# Read the survey extract from the project's data directory.
df = pd.read_parquet('../data/SO_2014_2022.pq')

# Mean salary for every (YearsCodePro, Gender) pair, flattened back into
# ordinary columns so later cells can filter it with .query().
avg_salary = (
    df
    .groupby(['YearsCodePro', 'Gender'])['Salary']
    .mean()
    .reset_index()
)
avg_salary

Unnamed: 0,YearsCodePro,Gender,Salary
0,0,female,39553.009805
1,0,male,32276.145762
2,1,female,39444.444886
3,1,male,32361.590644
4,2,female,41009.217794
...,...,...,...
95,48,male,114928.0
96,49,female,8.0
97,49,male,117607.3
98,50,female,133300.0


In [38]:
# Smoothed relative salary growth per gender.
#
# For each gender the per-year average salary is expressed as growth over that
# gender's starting salary (0.5 == +50 %), then smoothed with a centred 3-row
# rolling mean. Each smoothed value is labelled with the YearsCodePro of the
# window's middle row, so a missing year in avg_salary does not shift the
# labels of the years that follow it.
gender_frames = []

for gender in ['male', 'female']:
    # Baseline: mean salary over the raw rows in df with fewer than 2 years of
    # professional experience (not the mean of the per-year averages).
    start_salary = df.query(f"YearsCodePro < 2 & Gender == '{gender}'")['Salary'].mean()

    # Per-year averages for this gender, indexed by the real YearsCodePro value.
    yearly_salary = (
        avg_salary.query(f"Gender == '{gender}'")
        .set_index('YearsCodePro')['Salary']
    )
    relative_growth = yearly_salary.div(start_salary).sub(1)

    # min_periods=1 makes the window mean skip NaN entries instead of
    # propagating them; the two partial windows at the edges are then sliced
    # off so that only full 3-row windows remain.
    smoothed = (
        relative_growth
        .rolling(3, center=True, min_periods=1)
        .mean()
        .iloc[1:-1]
        .round(3)
    )

    gender_frames.append(smoothed.reset_index().assign(Gender=gender))

# Single concat at the end: no quadratic growth and no empty seed frame, so
# Salary keeps a numeric dtype.
smoothed_avg = pd.concat(gender_frames, ignore_index=True)[['YearsCodePro', 'Gender', 'Salary']]
smoothed_avg

Unnamed: 0,YearsCodePro,Gender,Salary
0,1,male,0.029
1,2,male,0.164
2,3,male,0.302
3,4,male,0.530
4,5,male,0.712
...,...,...,...
91,43,female,1.304
92,44,female,1.970
93,45,female,1.407
94,46,female,0.605


In [40]:
# Line chart of smoothed relative salary growth against years of professional
# coding experience, one line per gender.
fig = px.line(
    smoothed_avg,
    x='YearsCodePro',
    y='Salary',
    color='Gender',
    # Bind colours to gender values explicitly so they do not depend on the
    # order in which the genders appear in smoothed_avg.
    color_discrete_map={'male': '#5864ff', 'female': '#ef3bc2'},
    range_x=[1, 40],
    range_y=[0, 3],
)
fig.update_layout(
    title='Years of professional coding experience versus increase in salary from starting salary',
    xaxis_title="Years coded professionally",
    yaxis_title="Average increase in salary",
    # Salary holds a fraction (0.5 == +50 %), so render the ticks as percentages.
    yaxis_tickformat='.1%',
    width=800,
    hovermode="x",
)
# Same hover text for every trace; <extra></extra> hides the secondary box.
fig.update_traces(hovertemplate='<br>Salary increase: %{y}<extra></extra>')

fig.show()