In [1]:
# Run the sibling notebook set_theme.ipynb before anything else in this notebook.
# NOTE(review): presumably it configures the plotting theme used by the figure
# further down — confirm in set_theme.ipynb.
%run set_theme.ipynb

In [2]:
import pandas as pd
import plotly.graph_objects as go

In [3]:
# Respondent-level table used by every later cell. Going by the file name it
# covers the years 2014-2022; the file lives in ../data relative to the notebook.
survey_path = '../data/SO_2014_2022.pq'
full = pd.read_parquet(survey_path)

In [4]:
# Number of respondents for every (Gender, Education) pair, as a Series with a
# two-level index. Kept under its original name `df`.
df = full.groupby('Gender')['Education'].value_counts()

# One row per gender, one column per education label. A pair that never occurs
# becomes 0 instead of NaN, so a single missing pair cannot blank out the sums.
counts = df.unstack(fill_value=0)

# Collapse the raw labels into four bands, ordered from the highest level down.
res = pd.DataFrame({
    'master/doctorate': counts['doctor'] + counts['professional'] + counts['master'],
    'bachelor/tertiary': counts['bachelor'] + counts['tertiary'] + counts['assoc'],
    'secondary': counts['secondary'],
    'primary': counts['primary'] + counts['none'],
})

# Running total from left to right: each column now holds the respondents in
# that band or any band to its left, and 'primary' ends up as the row total.
res = res.cumsum(axis=1)

# Express every row as a percentage of its own total. Dividing the whole frame
# returns a new float frame and treats every gender label the same way; scaling
# integer rows in place through .loc would depend on silent int -> float
# upcasting, which pandas deprecated in 2.1.
res = res.div(res['primary'], axis=0) * 100
res

Unnamed: 0_level_0,master/doctorate,bachelor/tertiary,secondary,primary
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,28.899066,91.328082,99.219637,100
male,27.144707,89.210283,98.916874,100


In [14]:
# Solid colours for the Sankey nodes: blue for male, pink for female respondents.
blue, pink = '#645cff', '#f25fe9'

# The same two colours at 30 % opacity, used for the links between the nodes.
blue_alpha, pink_alpha = 'rgba(100,92,255,0.3)', 'rgba(242,95,233,0.3)'

# Sankey diagram of how far respondents of each gender got in education.
#
# Node numbering used by the links below:
#   0      target of the "did not continue" links (all of them have value 0)
#   1 - 4  male:   all respondents -> secondary -> tertiary -> master/doctorate
#   5 - 8  female: the same four stages
#   9      source of the two fully transparent links that feed nodes 1 and 5
#
# NOTE(review): the node arrays (x, y, customdata, color) hold 9 entries each
# while the links also reference node 9. The coordinates look hand-tuned
# against the rendered figure, so they are kept exactly as they were — confirm
# the layout visually before changing any of them.


def _link_values(gender):
    """Return the seven link values for one gender.

    Each retained share (in percent, read from `res`) is followed by a 0
    placeholder for the share that did not continue to that stage; the last 0
    belongs to the link leaving the final stage. The zeros stand in for
    "previous stage minus this stage" differences that were disabled.
    """
    return [
        res['secondary'][gender], 0,
        res['bachelor/tertiary'][gender], 0,
        res['master/doctorate'][gender], 0,
        0,
    ]


transparent = 'rgba(255,255,255,0)'

sankey_nodes = {
    'pad': 15,
    'thickness': 20,
    'x': [0.05, 0.33, 0.62, 0.9,
          0.05, 0.33, 0.62, 0.9, -1],
    'y': [0.3, 0.3, 0.735, 0.5865,
          0.2, 0.2, 0.2, 0.05, 0.1],
    # Hover text per node, in node order 0..8.
    'customdata': [
        "",
        "All valid answers from male<br> respondents were used",
        "Male respondents, secondary degree",
        "Male respondents, tertiary education",
        "Male respondents, master / doctorate",
        "All valid answers from female<br> respondents were used",
        "Female respondents, secondary degree",
        "Female respondents, tertiary education",
        "Female respondents, master / doctorate",
    ],
    'hovertemplate': '%{customdata}<extra></extra>',
    'color': ["green"] + [blue] * 4 + [pink] * 4,
}

sankey_links = {
    # Seven male links, seven female links, then the two transparent feeder links.
    'source': [1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9],
    'target': [2, 0, 3, 0, 4, 0, 0, 6, 0, 7, 0, 8, 0, 0, 1, 5],
    'value': _link_values('male') + _link_values('female') + [100, 100],
    # One hover text per link (16 in total). Links without a message carry an
    # empty string so the hover template always has a defined value.
    'customdata': [
        "Almost all male respondents<br>completed secondary education",
        "",
        "Most male respondents continued to tertiary education,<br>though slightly less than female respondents",
        "",
        "Over a quarter of male respondents continued to a master or doctorate,<br>though slightly less than female respondents",
        "",
        "",
        "Almost all female respondents<br>completed secondary education",
        "",
        # Compares against male respondents (this text used to say "female" twice).
        "Most female respondents continued to tertiary education,<br>though slightly more than male respondents",
        "",
        "Over a quarter of female respondents continued to a master or doctorate,<br>though slightly more than male respondents",
        "",
        "",
        "",
        "",
    ],
    'hovertemplate': ' %{customdata}<extra></extra>',
    'arrowlen': 14,
    'color': [blue_alpha] * 7 + [pink_alpha] * 7 + [transparent] * 2,
}

fig = go.Figure(data=[go.Sankey(
    arrangement='snap',
    node=sankey_nodes,
    link=sankey_links,
)])

# Figure-level settings: title, base font size, and the toolbar buttons to hide.
fig.update_layout(
    title={'text': "Retention in education system by gender"},
    font={'size': 15},
    modebar={'remove': ['lasso2d', 'select2d', 'resetsankeygroup']},
)

# White text inside the hover labels of every trace.
fig.update_traces(hoverlabel_font_color='white')

# Small grey subtitle centred above the diagram.
fig.add_annotation(
    text="Approximately equal, slightly higher for female respondents",
    x=0.5, y=1.13,
    xanchor='center',
    showarrow=False,
    font={'color': 'grey', 'size': 10},
)

# Stage names along the bottom edge, left to right.
stage_labels = [
    (-0.02, "All respondents"),
    (0.28, "Secondary"),
    (0.62, "Bachelor/Tertiary"),
    (0.955, "Master/PhD"),
]
for label_x, label_text in stage_labels:
    fig.add_annotation(x=label_x, y=-0.1, text=label_text, showarrow=False)

# Percentage printed next to each link: one row of labels per gender, one
# label per stage, read from the same `res` columns that size the links.
share_rows = [('male', 0.415), ('female', 0.965)]
share_columns = [('secondary', 0.155), ('bachelor/tertiary', 0.47), ('master/doctorate', 0.79)]
for gender, label_y in share_rows:
    for column, label_x in share_columns:
        fig.add_annotation(x=label_x, y=label_y,
                           text=f"{res[column][gender]:.1f}%",
                           showarrow=False)

# Gender captions on the left-hand side.
for label_x, label_y, label_text in [(-0.07, 0.80, "Female"), (-0.05, 0.20, "Male")]:
    fig.add_annotation(x=label_x, y=label_y, text=label_text, showarrow=False)

fig.show()