In [8]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web
import altair as alt

In [219]:
# Pull the national table through the project loader.
dfraw = load_names_from_web(category='national', hide_pre_1937=True, use_existing_files=True)

# The year bounds come from the full table, i.e. before the name filter
# below, so they cover every year present in the loaded data.
first_known_year, last_known_year = dfraw['year'].min(), dfraw['year'].max()

# Narrow to the single name under study; `all_years` is an independent copy
# of those rows.
dfraw = dfraw.loc[dfraw['name'].eq('Sarah')]
all_years = dfraw.copy()

In [220]:
def _median_age_snapshot(history, year):
    """Summarise every (name, M/F) group using only rows with year <= `year`.

    Parameters
    ----------
    history : pandas.DataFrame
        Rows with at least 'state', 'name', 'M/F', 'year' and 'count'
        columns, already sorted by 'year' ascending.
    year : int
        The cut-off year of the snapshot.

    Returns
    -------
    pandas.DataFrame
        One row per (name, M/F) group with the columns
        state, name, M/F, sum, age, thisyear_count, year where
        * sum            -- total 'count' of the group up to `year`,
        * age            -- `year` minus the median year, the median year being
                            the first year whose running total reaches half of
                            `sum`,
        * thisyear_count -- the group's 'count' in `year` itself (0 when the
                            group has no row for that year).
    """
    upto = history[history['year'] <= year]
    counts = upto.groupby(['name', 'M/F'])['count']
    upto = upto.assign(cumsum=counts.cumsum(), sum=counts.transform('sum'))

    # Rows are in year order, so the first row per group whose running total
    # reaches half of the group total marks the median year.
    medians = (
        upto[upto['cumsum'] >= upto['sum'] / 2]
        .drop_duplicates(subset=['name', 'M/F'], keep='first')
    )
    medians = medians.assign(age=year - medians['year'])

    thisyear = (
        upto.loc[upto['year'] == year, ['name', 'M/F', 'count']]
        .rename(columns={'count': 'thisyear_count'})
    )

    # Left merge keeps groups that have no row in `year`; their count is 0.
    snapshot = medians.merge(thisyear, how='left', on=['name', 'M/F'])
    snapshot = snapshot.assign(
        thisyear_count=snapshot['thisyear_count'].fillna(0),
        year=year,  # replaces the median year (already folded into 'age')
    )
    return snapshot[['state', 'name', 'M/F', 'sum', 'age', 'thisyear_count', 'year']]


# Sort once instead of once per year; a stable sort keeps rows that share a
# year in their loaded order.
history = all_years.sort_values(by='year', kind='stable')

# Collect the per-year frames and concatenate a single time: calling
# pd.concat inside the loop re-copies everything accumulated so far on each
# iteration.
snapshots = [
    _median_age_snapshot(history, year)
    for year in range(first_known_year, last_known_year + 1)
]

# Each snapshot keeps its own 0..k-1 index, exactly as before; an empty year
# range yields an empty frame instead of a concat error.
data = pd.concat(snapshots) if snapshots else pd.DataFrame()

data

Unnamed: 0,state,name,M/F,sum,age,thisyear_count,year
0,US,Sarah,F,3588,0,3588.0,1937
1,US,Sarah,M,25,0,25.0,1937
0,US,Sarah,M,47,1,22.0,1938
1,US,Sarah,F,7336,0,3748.0,1938
0,US,Sarah,F,11008,1,3672.0,1939
...,...,...,...,...,...,...,...
1,US,Sarah,F,924861,33,3060.0,2020
0,US,Sarah,M,2679,38,0.0,2021
1,US,Sarah,F,927716,34,2855.0,2021
0,US,Sarah,M,2679,39,0.0,2022


In [221]:
# Scatter of the yearly snapshots in `data`: one circle per (M/F, year) row.
# Explicit titles replace the raw column names ('age', 'thisyear_count',
# 'sum') that would otherwise label the axes and the size legend.
chart = alt.Chart(data).mark_circle().encode(
    x=alt.X('age', title='Median age (years)'),
    y=alt.Y('thisyear_count', title='Count in year'),
    size=alt.Size('sum', title='Cumulative count'),
    order='year',
    tooltip=['name', 'M/F', 'year', 'age', 'thisyear_count', 'sum'],
    # Rows whose 'M/F' value is 'F' are pink; every other row is blue.
    color=alt.condition(alt.datum['M/F'] == 'F', alt.value('#FF69B4'), alt.value('#85C1E9'))
).properties(
    title='Median age vs. yearly count, one point per year',
    width=450,
    height=400
)
chart