In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
from data_loader import load_names_from_web, births_in_year, births_per_year
import altair as alt

# Lift Altair's limit on the number of rows a chart may embed, so the
# larger frames built later in this notebook can be plotted directly.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### Load Data

In [2]:
# Load the national baby-name records (pre-1937 rows included), reusing
# files that are already on disk instead of downloading them again.
dfraw = load_names_from_web(
    category='national',
    hide_pre_1937=False,
    use_existing_files=True,
)

# Span of years covered by the data; drives the per-year loop further down.
year_column = dfraw['year']
first_known_year = year_column.min()
last_known_year = year_column.max()

# Total births per year, later indexed as births_by_year.at[year, sex].
births_by_year = births_per_year()

# Debugging aid: uncomment to restrict the analysis to a single name.
# dfraw = dfraw[dfraw['name'] == 'Sarah']

# Work on a copy so the raw frame stays untouched.
all_years = dfraw.copy()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'national_unzip_path'

In [None]:
# Build one summary frame per year and concatenate them once at the end.
# (Growing a DataFrame with pd.concat inside the loop re-copies all earlier
# rows on every iteration.)
yearly_frames = []

key_cols = ['name', 'M/F']
output_cols = [
    'state', 'name', 'M/F', 'sum', 'age',
    'thisyear_count', 'thisyear_rank', 'thisyear_pct',
    'thisyear_count_adj', 'thisyear_pct_adj',
]

# The chronological sort does not depend on the loop year, so do it once.
# A stable sort keeps the original row order within each year, which makes
# the method='first' tie-breaking of the ranks below reproducible.
all_years_sorted = all_years.sort_values(by='year', kind='stable')

for year in range(first_known_year, last_known_year + 1):

    print(year)  # progress indicator

    # Everything recorded up to and including the current year.
    history = all_years_sorted[all_years_sorted['year'] <= year]

    # Running and total counts per (name, sex). The first row whose running
    # count reaches half of the total marks the year by which half of all
    # births with that name (so far) had occurred.
    grouped_counts = history.groupby(key_cols)['count']
    history = history.assign(
        cumsum=grouped_counts.cumsum(),
        sum=grouped_counts.transform('sum'),
    )
    medians = (
        history[history['cumsum'] >= history['sum'] / 2]
        .drop_duplicates(subset=key_cols, keep='first')
        .assign(age=lambda frame: year - frame['year'])
    )

    # Counts for the current year only; names with no record this year get 0.
    thisyear = (
        history.loc[history['year'] == year, key_cols + ['count']]
        .rename(columns={'count': 'thisyear_count'})
    )
    summary = medians.merge(thisyear, how='left', on=key_cols)
    summary['thisyear_count'] = summary['thisyear_count'].fillna(0)
    # Adjusted count: zeros are replaced by 2 -- presumably a stand-in for
    # names that fall below the dataset's reporting threshold; TODO confirm.
    summary['thisyear_count_adj'] = summary['thisyear_count'].replace(0, 2)
    summary['thisyear_rank'] = (
        summary.groupby('M/F')['thisyear_count'].rank(method='first', ascending=False)
    )

    # Share of this year's births of the same sex. births_by_year.loc[year]
    # is the row of per-sex totals, so mapping the sex column through it
    # replaces the former row-by-row apply with one vectorised lookup.
    births_this_year = summary['M/F'].map(births_by_year.loc[year])
    summary['thisyear_pct'] = summary['thisyear_count'] / births_this_year
    summary['thisyear_pct_adj'] = summary['thisyear_count_adj'] / births_this_year

    # Drop the helper columns and tag every row with the current year.
    yearly_frames.append(summary[output_cols].assign(year=year))

# Single concatenation; fall back to an empty frame if the year range is empty.
data = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

# display dataframe
data

In [None]:
# Keep an untouched copy of the per-year summary; every analysis cell below
# starts again from `backup.copy()` so the cells can be re-run independently.
backup = data.copy()

In [None]:
# Keep the 100 top-ranked names of every fifth year, then split by sex.
data = backup.copy()
is_top_100 = data['thisyear_rank'] <= 100
is_fifth_year = data['year'] % 5 == 0
data = data[is_top_100 & is_fifth_year]
data_F = data.loc[data['M/F'].eq('F')]
data_M = data.loc[data['M/F'].eq('M')]

# Encoding and size shared by both panels: rank on x, share of births on y.
rank_encoding = dict(
    x='thisyear_rank',
    y='thisyear_pct',
    tooltip=['name', 'M/F', 'year', 'age', 'thisyear_count', 'sum'],
)
chart_size = dict(width=450, height=400)

# Female panel: default colour scale keyed on year.
chart_F = (
    alt.Chart(data_F)
    .mark_line()
    .encode(color='year', **rank_encoding)
    .properties(**chart_size)
)
# Male panel: same encoding, grey colour scheme.
chart_M = (
    alt.Chart(data_M)
    .mark_line()
    .encode(color=alt.Color('year', scale=alt.Scale(scheme='greys')), **rank_encoding)
    .properties(**chart_size)
)

# Show both panels side by side.
chart_F | chart_M

# ranks need to be cleaned up by gender

In [None]:
# Start again from the full per-year summary and split it by sex.
data = backup.copy()
data_F, data_M = (data[data['M/F'] == sex] for sex in ('F', 'M'))

# Function to calculate the combined share of the top-ranked names per year
def top_shares(df, top_ns=(1, 10, 100, 1000)):
    """Sum the birth share of the N best-ranked names for every year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'year', 'thisyear_rank' and 'thisyear_pct'.
    top_ns : iterable of int, optional
        Cut-offs N to evaluate; defaults to 1, 10, 100 and 1000.

    Returns
    -------
    pandas.DataFrame
        One row per (year, N) with the columns 'year', 'top_x' (the cut-off N)
        and 'total_pct' (sum of 'thisyear_pct' over the N rows with the
        smallest 'thisyear_rank'). The three columns are present even when
        `df` is empty, so downstream merges and charts keep working.
    """
    columns = ['year', 'top_x', 'total_pct']
    records = []
    for year, group in df.groupby('year'):
        # Best rank first, so head(n) picks the n most popular names.
        ranked_pct = group.sort_values(by='thisyear_rank')['thisyear_pct']
        for n in top_ns:
            records.append({
                'year': year,
                'top_x': n,
                'total_pct': ranked_pct.head(n).sum(),
            })
    return pd.DataFrame(records, columns=columns)

# Replace the per-sex frames with their top-N share tables
# (columns: year, top_x, total_pct).
data_F = top_shares(data_F)
data_M = top_shares(data_M)

# Encoding and size shared by both panels: year on x, combined share on y.
share_encoding = dict(
    x='year',
    y='total_pct',
    tooltip=['year', 'top_x', 'total_pct'],
)
chart_size = dict(width=450, height=400)

# Female panel: reversed grey scheme on a log colour scale keyed on the cut-off.
chart_F = (
    alt.Chart(data_F)
    .mark_line()
    .encode(
        color=alt.Color('top_x', scale=alt.Scale(scheme='greys', reverse=True, type='log')),
        **share_encoding,
    )
    .properties(**chart_size)
)
# Male panel: default colour scale keyed on the cut-off.
chart_M = (
    alt.Chart(data_M)
    .mark_line()
    .encode(color='top_x', **share_encoding)
    .properties(**chart_size)
)

# Show both panels side by side.
chart_F | chart_M

In [None]:
# Difference in top-N shares between the sexes (female minus male).
# Expects data_F / data_M to hold the top-share tables from the previous cell.
data_diff = pd.merge(
    data_F,
    data_M,
    how='inner',
    on=['year', 'top_x'],
    suffixes=('_f', '_m'),
).assign(diff=lambda merged: merged['total_pct_f'] - merged['total_pct_m'])

# One line per cut-off, coloured on a reversed grey log scale.
chart_diff = (
    alt.Chart(data_diff)
    .mark_line()
    .encode(
        x='year',
        y='diff',
        tooltip=['year', 'top_x', 'diff'],
        color=alt.Color('top_x', scale=alt.Scale(scheme='greys', reverse=True, type='log')),
    )
    .properties(width=450, height=400)
)

# Show the chart.
chart_diff

In [None]:
# Shannon entropy of the yearly name distribution, per sex.
# Start from the full summary, keep rows ranked within the first 100000
# names of their year, and split by sex.
data = backup.copy()
data = data.loc[data['thisyear_rank'].le(100000)]
data_F, data_M = (data.loc[data['M/F'].eq(sex)] for sex in ('F', 'M'))

# Shannon entropy helper
def shannon_entropy(pcts):
    """Return the Shannon entropy, in bits, of the positive entries of `pcts`.

    Entries that are not strictly positive are left out before the sum
    -sum(p * log2(p)) is taken, so zeros never reach the logarithm.
    """
    positive = pcts[pcts > 0]
    weighted_logs = positive * np.log2(positive)
    return -np.sum(weighted_logs)

# Entropy per year for each sex, as tidy frames with columns year / entropy.
entropy_F, entropy_M = (
    frame.groupby('year')['thisyear_pct']
    .apply(shannon_entropy)
    .rename('entropy')
    .reset_index()
    for frame in (data_F, data_M)
)

# Side-by-side table plus the male-minus-female difference.
entropy_diff = entropy_F.merge(
    entropy_M, how='inner', on='year', suffixes=('_f', '_m')
)
entropy_diff = entropy_diff.assign(
    diff=entropy_diff['entropy_m'] - entropy_diff['entropy_f']
)

# Encoding and size shared by both panels.
entropy_encoding = dict(x='year', y='entropy', tooltip=['year', 'entropy'])
chart_size = dict(width=450, height=400)

# Female panel in red, male panel in blue.
chart_F = (
    alt.Chart(entropy_F)
    .mark_line()
    .encode(color=alt.value('red'), **entropy_encoding)
    .properties(**chart_size)
)
chart_M = (
    alt.Chart(entropy_M)
    .mark_line()
    .encode(color=alt.value('blue'), **entropy_encoding)
    .properties(**chart_size)
)

# Show both panels side by side on a shared y scale.
(chart_F | chart_M).resolve_scale(y='shared')

In [None]:
# Entropy gap between the sexes over time (male minus female).
# NOTE: the name chart_M is reused here for the difference chart.
chart_M = (
    alt.Chart(entropy_diff)
    .mark_line()
    .encode(
        x='year',
        y='diff',
        tooltip=['year', 'diff'],
        color=alt.value('grey'),
    )
    .properties(width=450, height=400)
)

# Show the chart.
chart_M