In [5]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web
import altair as alt
import warnings

# Install a global filter that ignores every warning of category FutureWarning
# for the rest of the session.
# NOTE(review): the filter is not scoped to a module or a block, so it hides
# FutureWarnings raised by every imported library — consider narrowing it with
# `module=` or a `warnings.catch_warnings()` context around the noisy call.
warnings.filterwarnings("ignore", category=FutureWarning)

In [6]:
# Fetch the national names data through the project loader
dfraw = load_names_from_web(
    category='national',
    hide_pre_1937=True,
    use_existing_files=True,
)

# Record the year span of the full dataset before narrowing it to one name
first_known_year, last_known_year = dfraw['year'].min(), dfraw['year'].max()

# Keep only the rows for the single name under study
dfraw = dfraw.loc[dfraw['name'].eq('Jordan')]

# Downstream cells work on an independent copy of the filtered rows
all_years = dfraw.copy()

In [7]:
# For every year in the known range, build one row per (name, M/F) holding:
#   sum            - births recorded up to and including that year
#   age            - that year minus the birth year at which the running count
#                    first reaches half of `sum` (the median birth year)
#   thisyear_count - births recorded in that year itself (0 when absent)
#   year           - the year the row describes

# Sort once up front: boolean filtering below preserves row order, so the sort
# does not need to be repeated for each year. A stable sort keeps the relative
# order of rows that share a year deterministic.
births_by_year = all_years.sort_values(by='year', kind='stable')

# One small frame per year; concatenated a single time after the loop instead
# of re-copying an ever-growing frame on each iteration.
yearly_frames = []

# Iterate over each year - first known year to the last known year
for year in range(first_known_year, last_known_year + 1):

    # Rows for everyone born up to and including this year
    born_so_far = births_by_year[births_by_year['year'] <= year]
    counts_by_group = born_so_far.groupby(['name', 'M/F'])['count']

    # .assign returns a new frame, so no column is written into a slice
    df = born_so_far.assign(
        cumsum=counts_by_group.cumsum(),         # running births per name/gender
        sum=counts_by_group.transform('sum'),    # total births per name/gender so far
    )

    # Find the median age for each name and gender: the first (earliest) row
    # whose running count reaches half of the total marks the median birth year
    medians = (
        df[df['cumsum'] >= df['sum'] / 2]
        .drop_duplicates(subset=['name', 'M/F'], keep='first')
        .assign(age=lambda m: year - m['year'])
    )

    # Births recorded in the current year, keyed the same way as `medians`
    thisyear = (
        df.loc[df['year'] == year, ['name', 'M/F', 'count']]
        .rename(columns={'count': 'thisyear_count'})
    )

    # Left merge keeps groups with no births this year; those get a count of 0
    df = (
        medians.merge(thisyear, how='left', on=['name', 'M/F'])
        .assign(thisyear_count=lambda m: m['thisyear_count'].fillna(0))
        .loc[:, ['state', 'name', 'M/F', 'sum', 'age', 'thisyear_count']]
        .assign(year=year)  # the year this snapshot describes
    )
    yearly_frames.append(df)

# Single concatenation; per-year index labels are kept as they were
data = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

data

Unnamed: 0,state,name,M/F,sum,age,thisyear_count,year
0,US,Jordan,M,71,0,71.0,1937
0,US,Jordan,M,136,1,65.0,1938
0,US,Jordan,M,203,1,67.0,1939
0,US,Jordan,M,260,2,57.0,1940
0,US,Jordan,M,323,2,63.0,1941
...,...,...,...,...,...,...,...
1,US,Jordan,M,381172,22,4274.0,2020
0,US,Jordan,F,133260,23,738.0,2021
1,US,Jordan,M,385178,23,4006.0,2021
0,US,Jordan,M,389056,24,3878.0,2022


In [10]:
# Bubble chart: one circle per row of `data`, positioned by age and by the
# count for that year, sized by the running total and drawn in year order.
chart = (
    alt.Chart(data)
    .mark_circle()
    .encode(
        alt.X('age'),
        alt.Y('thisyear_count'),
        alt.Size('sum'),
        alt.Order('year'),
        # Fixed colours: one for rows where M/F is 'F', another for all other rows
        color=alt.condition(
            alt.datum['M/F'] == 'F',
            alt.value('#FF69B4'),
            alt.value('#85C1E9'),
        ),
        tooltip=['name', 'M/F', 'year', 'thisyear_count', 'age', 'sum'],
    )
    .properties(height=400, width=450)
)
chart