# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Aggregate-data-at-year-and-sex-level" data-toc-modified-id="Aggregate-data-at-year-and-sex-level-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Aggregate data at year and sex level</a></div><div class="lev1 toc-item"><a href="#Add-a-prop-column" data-toc-modified-id="Add-a-prop-column-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Add a prop column</a></div><div class="lev1 toc-item"><a href="#Get-top-1000-names-for-each-sex/year-combination" data-toc-modified-id="Get-top-1000-names-for-each-sex/year-combination-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Get top 1000 names for each sex/year combination</a></div><div class="lev1 toc-item"><a href="#Analyzing-naming-trends" data-toc-modified-id="Analyzing-naming-trends-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Analyzing naming trends</a></div><div class="lev2 toc-item"><a href="#Measuring-the-increase-in-naming-diversity" data-toc-modified-id="Measuring-the-increase-in-naming-diversity-41"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Measuring the increase in naming diversity</a></div><div class="lev2 toc-item"><a href="#The-&quot;Last-letter&quot;-revolution" data-toc-modified-id="The-&quot;Last-letter&quot;-revolution-42"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>The "Last letter" revolution</a></div><div class="lev2 toc-item"><a href="#Boy-names-that-became-girl-names-(and-vice-versa)" data-toc-modified-id="Boy-names-that-became-girl-names-(and-vice-versa)-43"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Boy names that became girl names (and vice versa)</a></div>

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Years covered by the analysis; one 'yob<year>.txt' file is read per year.
years = range(2000, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = 'yob{}.txt'.format(year)
    df = pd.read_csv(path, names=columns)
    # Only name/sex/births are read from the file, so tag each row with the
    # year of the file it came from.
    df['year'] = year
    pieces.append(df)

# Concatenate everything into a single DataFrame -- once, after the loop,
# instead of rebuilding the ever-growing frame on every iteration.
names = pd.concat(pieces, ignore_index=True)

# Aggregate data at year and sex level

In [None]:
# Total births per (year, sex): one row per year, one column per sex.
# The aggregation is given by name ('sum') rather than by passing the builtin
# `sum`, matching the pivot_table call in the final cell of this notebook.
total_births = names.pivot_table('births', index='year', columns=['sex'], aggfunc='sum')
total_births.plot(title='Total births by sex and year')

# Add a prop column

In [None]:
def add_prop(group):
    """Return ``group`` with an added ``prop`` column.

    ``prop`` is each row's ``births`` divided by the total ``births`` of the
    whole group, computed in floating point.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns plus ``prop``; the frame
        passed in is not modified.
    """
    births = group['births'].astype(float)
    # assign() builds a new frame instead of writing into the group object
    # handed over by groupby.apply.
    return group.assign(prop=births / births.sum())

# group_keys=False keeps the original flat row index on the result. Without
# it, recent pandas versions prepend the group keys to the index, leaving
# 'year' and 'sex' as both index levels and columns, which makes the
# groupby below (and later ones) ambiguous.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)

# Sanity check: the prop column must sum to 1 within every (year, sex) group.
# Written as an assert so that a violation stops the notebook.
assert np.allclose(names.groupby(['year', 'sex'])['prop'].sum(), 1), \
    'prop does not sum to 1 within every year/sex group'


# Get top 1000 names for each sex/year combination

In [None]:
def get_top1000(group):
    """Return the rows of ``group`` with the highest ``births``, at most 1000.

    The rows come back ordered by ``births`` from largest to smallest and
    keep their original index labels.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(1000)

grouped = names.groupby(['year', 'sex'])
# apply() stacks the per-group results under a (year, sex, row) index that
# duplicates the 'year' and 'sex' columns. Drop it for a plain positional
# index so that later pivot_table / groupby calls on those column names are
# unambiguous.
top1000 = grouped.apply(get_top1000).reset_index(drop=True)

# Analyzing naming trends

In [None]:
# Split the top-1000 table by sex.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

# Births per year for every name. The pivot has no sex dimension, so a name
# given to both sexes is summed over both.
# NOTE: this rebinds `total_births`, which previously held births by sex.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')

# One subplot per selected name.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False, title='Number of births per year')

## Measuring the increase in naming diversity

In [None]:
# Share of all births covered by the top-1000 names, per year and sex.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')

# Take the x ticks from the table's own years so that every year present in
# the data gets a tick (a hard-coded range(2000, 2010) leaves out 2010).
table.plot(title='Sum of top1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=list(table.index))

In [None]:
# How many of the most popular boys' names are needed to cover half of the
# births? Sort by prop (descending), accumulate, and locate where the running
# total reaches 0.5. searchsorted gives a zero-based position, so +1 turns it
# into a count of names.
df_2010 = boys[boys.year == 2010]

prop_cumsum = df_2010.sort_values(by='prop', ascending=False)['prop'].cumsum()
half_count_2010 = prop_cumsum.values.searchsorted(0.5) + 1

df_2000 = boys[boys.year == 2000]

prop_cumsum = df_2000.sort_values(by='prop', ascending=False)['prop'].cumsum()
half_count_2000 = prop_cumsum.values.searchsorted(0.5) + 1

# These are not the last expression of the cell, so print them explicitly;
# otherwise the results are computed and silently discarded.
print('Boys names covering 50% of births in 2010:', half_count_2010)
print('Boys names covering 50% of births in 2000:', half_count_2000)

def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The rows are sorted by ``prop`` in descending order and accumulated; the
    result is the one-based position at which the running total first
    reaches ``q``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``prop`` column.
    q : float, default 0.5
        Cumulative proportion to reach.

    Returns
    -------
    int
        Number of rows, taken from the largest ``prop`` downwards.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    # Search on the underlying NumPy array: for a scalar `q` this always
    # yields a scalar, whereas Series.searchsorted returns an array or a
    # scalar depending on the pandas version (so indexing it with [0] is
    # not portable).
    position = ranked['prop'].cumsum().values.searchsorted(q)
    # searchsorted is zero-based; +1 converts the position into a count.
    return int(position) + 1

# For every (year, sex) group, count the names needed to reach the default
# quantile of get_quantile_count, then spread the sexes into columns so each
# one is drawn as its own line over the years.
diversity = (
    top1000
    .groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)

diversity.plot(title="Number of popular names in top 50%")

## The "Last letter" revolution

In [None]:
def get_last_letter(x):
    """Return the last character of the string ``x``."""
    return x[-1]
# Last letter of every name, as a Series aligned with `names`.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'

# Births per last letter (rows) for every (sex, year) pair (columns).
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
# Keep three representative years for the bar charts.
subtable = table.reindex(columns=[2000, 2005, 2010], level='year')

# Normalise each (sex, year) column by its total so that it holds the share
# of births per last letter.
letter_prop = subtable / subtable.sum().astype(float)

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)

In [None]:
# Share of births per last letter for every (sex, year) column, this time
# over all years rather than the three used for the bar charts.
# NOTE(review): relies on `table` still being the last-letter pivot built in
# the previous cell -- run the cells in order.
letter_prop = table / table.sum().astype(float)

# Label-based selection of the rows 'd', 'n', 'y' and the male columns;
# .loc replaces the .ix indexer, which has been removed from pandas.
# Transposed so that years form the index and each letter becomes one line.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T

dny_ts.plot()

## Boy names that became girl names (and vice versa)

In [None]:
all_names = top1000['name'].unique()

# Flag every distinct name that contains 'ama', ignoring case.
# (`lesley_like` holds the names matching that pattern, whatever the variable
# name suggests.)
mask = np.array(['ama' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
filtered = top1000[top1000['name'].isin(lesley_like)]
filtered.groupby('name')['births'].sum()

# Births of the matching names per year and sex, then divided by the yearly
# row total so that each row holds the split between the sexes.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
table = table.div(table.sum(axis=1), axis='index')

table.plot(style={'M': 'k-', 'F': 'k--'})